OLD | NEW |
---|---|
1 /* | 1 /* |
2 * ARMv8 NEON optimizations for libjpeg-turbo | 2 * ARMv8 NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * Copyright (C) 2013-2014, Linaro Limited | 7 * Copyright (C) 2013-2014, Linaro Limited |
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
9 * | 9 * |
10 * This software is provided 'as-is', without any express or implied | 10 * This software is provided 'as-is', without any express or implied |
(...skipping 11 matching lines...) Expand all Loading... | |
22 * 2. Altered source versions must be plainly marked as such, and must not be | 22 * 2. Altered source versions must be plainly marked as such, and must not be |
23 * misrepresented as being the original software. | 23 * misrepresented as being the original software. |
24 * 3. This notice may not be removed or altered from any source distribution. | 24 * 3. This notice may not be removed or altered from any source distribution. |
25 */ | 25 */ |
26 | 26 |
27 #if defined(__linux__) && defined(__ELF__) | 27 #if defined(__linux__) && defined(__ELF__) |
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
29 #endif | 29 #endif |
30 | 30 |
31 .text | 31 .text |
32 #ifndef __clang__ | |
32 .arch armv8-a+fp+simd | 33 .arch armv8-a+fp+simd |
Nico
2016/04/20 20:53:30
again, can we please fix this in clang instead?
| |
34 #endif | |
33 | 35 |
34 | 36 |
35 #define RESPECT_STRICT_ALIGNMENT 1 | 37 #define RESPECT_STRICT_ALIGNMENT 1 |
36 | 38 |
37 | 39 |
38 /*****************************************************************************/ | 40 /*****************************************************************************/ |
39 | 41 |
40 /* Supplementary macro for setting function attributes */ | 42 /* Supplementary macro for setting function attributes */ |
41 .macro asm_function fname | 43 .macro asm_function fname |
42 #ifdef __APPLE__ | 44 #ifdef __APPLE__ |
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
190 tmp0 = q4; \ | 192 tmp0 = q4; \ |
191 tmp1 = q5; \ | 193 tmp1 = q5; \ |
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | 194 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
193 tmp3 = q7; \ | 195 tmp3 = q7; \ |
194 tmp10 = q2; \ | 196 tmp10 = q2; \ |
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | 197 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
196 tmp12 = q3; \ | 198 tmp12 = q3; \ |
197 tmp13 = q1; \ | 199 tmp13 = q1; \ |
198 } | 200 } |
199 | 201 |
200 #define XFIX_0_899976223 v0.4h[0] | 202 #define XFIX_0_899976223 v0.h[0] |
201 #define XFIX_0_541196100 v0.4h[1] | 203 #define XFIX_0_541196100 v0.h[1] |
202 #define XFIX_2_562915447 v0.4h[2] | 204 #define XFIX_2_562915447 v0.h[2] |
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | 205 #define XFIX_0_298631336_MINUS_0_899976223 v0.h[3] |
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | 206 #define XFIX_1_501321110_MINUS_0_899976223 v1.h[0] |
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | 207 #define XFIX_2_053119869_MINUS_2_562915447 v1.h[1] |
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | 208 #define XFIX_0_541196100_PLUS_0_765366865 v1.h[2] |
207 #define XFIX_1_175875602 v1.4h[3] | 209 #define XFIX_1_175875602 v1.h[3] |
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | 210 #define XFIX_1_175875602_MINUS_0_390180644 v2.h[0] |
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | 211 #define XFIX_0_541196100_MINUS_1_847759065 v2.h[1] |
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | 212 #define XFIX_3_072711026_MINUS_2_562915447 v2.h[2] |
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | 213 #define XFIX_1_175875602_MINUS_1_961570560 v2.h[3] |
212 | 214 |
213 .balign 16 | 215 .balign 16 |
214 jsimd_idct_islow_neon_consts: | 216 jsimd_idct_islow_neon_consts: |
215 .short FIX_0_899976223 /* d0[0] */ | 217 .short FIX_0_899976223 /* d0[0] */ |
216 .short FIX_0_541196100 /* d0[1] */ | 218 .short FIX_0_541196100 /* d0[1] */ |
217 .short FIX_2_562915447 /* d0[2] */ | 219 .short FIX_2_562915447 /* d0[2] */ |
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ | 220 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ | 221 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ | 222 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ | 223 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
263 st1 {v12.8b - v15.8b}, [sp], 32 | 265 st1 {v12.8b - v15.8b}, [sp], 32 |
264 st1 {v16.8b - v19.8b}, [sp], 32 | 266 st1 {v16.8b - v19.8b}, [sp], 32 |
265 st1 {v20.8b - v23.8b}, [sp], 32 | 267 st1 {v20.8b - v23.8b}, [sp], 32 |
266 st1 {v24.8b - v27.8b}, [sp], 32 | 268 st1 {v24.8b - v27.8b}, [sp], 32 |
267 st1 {v28.8b - v31.8b}, [sp], 32 | 269 st1 {v28.8b - v31.8b}, [sp], 32 |
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 | 270 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 |
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 271 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | 272 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 |
271 mul v16.4h, v16.4h, v0.4h | 273 mul v16.4h, v16.4h, v0.4h |
272 mul v17.4h, v17.4h, v1.4h | 274 mul v17.4h, v17.4h, v1.4h |
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | 275 ins v16.d[1], v17.d[0] /* 128 bit q8 */ |
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 276 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
275 mul v18.4h, v18.4h, v2.4h | 277 mul v18.4h, v18.4h, v2.4h |
276 mul v19.4h, v19.4h, v3.4h | 278 mul v19.4h, v19.4h, v3.4h |
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | 279 ins v18.d[1], v19.d[0] /* 128 bit q9 */ |
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | 280 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 |
279 mul v20.4h, v20.4h, v4.4h | 281 mul v20.4h, v20.4h, v4.4h |
280 mul v21.4h, v21.4h, v5.4h | 282 mul v21.4h, v21.4h, v5.4h |
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | 283 ins v20.d[1], v21.d[0] /* 128 bit q10 */ |
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 284 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
283 mul v22.4h, v22.4h, v6.4h | 285 mul v22.4h, v22.4h, v6.4h |
284 mul v23.4h, v23.4h, v7.4h | 286 mul v23.4h, v23.4h, v7.4h |
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | 287 ins v22.d[1], v23.d[0] /* 128 bit q11 */ |
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] | 288 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] |
287 mul v24.4h, v24.4h, v0.4h | 289 mul v24.4h, v24.4h, v0.4h |
288 mul v25.4h, v25.4h, v1.4h | 290 mul v25.4h, v25.4h, v1.4h |
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | 291 ins v24.d[1], v25.d[0] /* 128 bit q12 */ |
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 292 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
291 mul v28.4h, v28.4h, v4.4h | 293 mul v28.4h, v28.4h, v4.4h |
292 mul v29.4h, v29.4h, v5.4h | 294 mul v29.4h, v29.4h, v5.4h |
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | 295 ins v28.d[1], v29.d[0] /* 128 bit q14 */ |
294 mul v26.4h, v26.4h, v2.4h | 296 mul v26.4h, v26.4h, v2.4h |
295 mul v27.4h, v27.4h, v3.4h | 297 mul v27.4h, v27.4h, v3.4h |
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | 298 ins v26.d[1], v27.d[0] /* 128 bit q13 */ |
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ | 299 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ |
298 add x15, x15, #16 | 300 add x15, x15, #16 |
299 mul v30.4h, v30.4h, v6.4h | 301 mul v30.4h, v30.4h, v6.4h |
300 mul v31.4h, v31.4h, v7.4h | 302 mul v31.4h, v31.4h, v7.4h |
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | 303 ins v30.d[1], v31.d[0] /* 128 bit q15 */ |
302 /* Go to the bottom of the stack */ | 304 /* Go to the bottom of the stack */ |
303 sub sp, sp, 352 | 305 sub sp, sp, 352 |
304 stp x4, x5, [sp], 16 | 306 stp x4, x5, [sp], 16 |
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | 307 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ |
306 st1 {v12.4h - v15.4h}, [sp], 32 | 308 st1 {v12.4h - v15.4h}, [sp], 32 |
307 /* 1-D IDCT, pass 1, left 4x8 half */ | 309 /* 1-D IDCT, pass 1, left 4x8 half */ |
308 add v4.4h, ROW7L.4h, ROW3L.4h | 310 add v4.4h, ROW7L.4h, ROW3L.4h |
309 add v5.4h, ROW5L.4h, ROW1L.4h | 311 add v5.4h, ROW5L.4h, ROW1L.4h |
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | 312 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 |
311 smlal v12.4s, v5.4h, XFIX_1_175875602 | 313 smlal v12.4s, v5.4h, XFIX_1_175875602 |
(...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
546 add v12.4s, v4.4s, v14.4s | 548 add v12.4s, v4.4s, v14.4s |
547 sub v4.4s, v4.4s, v14.4s | 549 sub v4.4s, v4.4s, v14.4s |
548 add v10.4s, v2.4s, v8.4s | 550 add v10.4s, v2.4s, v8.4s |
549 sub v6.4s, v2.4s, v8.4s | 551 sub v6.4s, v2.4s, v8.4s |
550 shrn ROW7R.4h, v4.4s, #16 | 552 shrn ROW7R.4h, v4.4s, #16 |
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 553 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 554 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
553 shrn ROW4R.4h, v6.4s, #16 | 555 shrn ROW4R.4h, v6.4s, #16 |
554 | 556 |
555 2: /* Descale to 8-bit and range limit */ | 557 2: /* Descale to 8-bit and range limit */ |
556 ins v16.2d[1], v17.2d[0] | 558 ins v16.d[1], v17.d[0] |
557 ins v18.2d[1], v19.2d[0] | 559 ins v18.d[1], v19.d[0] |
558 ins v20.2d[1], v21.2d[0] | 560 ins v20.d[1], v21.d[0] |
559 ins v22.2d[1], v23.2d[0] | 561 ins v22.d[1], v23.d[0] |
560 sqrshrn v16.8b, v16.8h, #2 | 562 sqrshrn v16.8b, v16.8h, #2 |
561 sqrshrn2 v16.16b, v18.8h, #2 | 563 sqrshrn2 v16.16b, v18.8h, #2 |
562 sqrshrn v18.8b, v20.8h, #2 | 564 sqrshrn v18.8b, v20.8h, #2 |
563 sqrshrn2 v18.16b, v22.8h, #2 | 565 sqrshrn2 v18.16b, v22.8h, #2 |
564 | 566 |
565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ | 567 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ |
566 ld1 {v8.4h - v11.4h}, [sp], 32 | 568 ld1 {v8.4h - v11.4h}, [sp], 32 |
567 ld1 {v12.4h - v15.4h}, [sp], 32 | 569 ld1 {v12.4h - v15.4h}, [sp], 32 |
568 ins v24.2d[1], v25.2d[0] | 570 ins v24.d[1], v25.d[0] |
569 | 571 |
570 sqrshrn v20.8b, v24.8h, #2 | 572 sqrshrn v20.8b, v24.8h, #2 |
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | 573 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
572 /* trn1 v16.8h, v16.8h, v18.8h */ | 574 /* trn1 v16.8h, v16.8h, v18.8h */ |
573 transpose v16, v18, v3, .16b, .8h | 575 transpose v16, v18, v3, .16b, .8h |
574 ins v26.2d[1], v27.2d[0] | 576 ins v26.d[1], v27.d[0] |
575 ins v28.2d[1], v29.2d[0] | 577 ins v28.d[1], v29.d[0] |
576 ins v30.2d[1], v31.2d[0] | 578 ins v30.d[1], v31.d[0] |
577 sqrshrn2 v20.16b, v26.8h, #2 | 579 sqrshrn2 v20.16b, v26.8h, #2 |
578 sqrshrn v22.8b, v28.8h, #2 | 580 sqrshrn v22.8b, v28.8h, #2 |
579 movi v0.16b, #(CENTERJSAMPLE) | 581 movi v0.16b, #(CENTERJSAMPLE) |
580 sqrshrn2 v22.16b, v30.8h, #2 | 582 sqrshrn2 v22.16b, v30.8h, #2 |
581 transpose_single v16, v17, v3, .2d, .8b | 583 transpose_single v16, v17, v3, .d, .8b |
582 transpose_single v18, v19, v3, .2d, .8b | 584 transpose_single v18, v19, v3, .d, .8b |
583 add v16.8b, v16.8b, v0.8b | 585 add v16.8b, v16.8b, v0.8b |
584 add v17.8b, v17.8b, v0.8b | 586 add v17.8b, v17.8b, v0.8b |
585 add v18.8b, v18.8b, v0.8b | 587 add v18.8b, v18.8b, v0.8b |
586 add v19.8b, v19.8b, v0.8b | 588 add v19.8b, v19.8b, v0.8b |
587 transpose v20, v22, v3, .16b, .8h | 589 transpose v20, v22, v3, .16b, .8h |
588 /* Store results to the output buffer */ | 590 /* Store results to the output buffer */ |
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 591 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
590 add TMP1, TMP1, OUTPUT_COL | 592 add TMP1, TMP1, OUTPUT_COL |
591 add TMP2, TMP2, OUTPUT_COL | 593 add TMP2, TMP2, OUTPUT_COL |
592 st1 {v16.8b}, [TMP1] | 594 st1 {v16.8b}, [TMP1] |
593 transpose_single v20, v21, v3, .2d, .8b | 595 transpose_single v20, v21, v3, .d, .8b |
594 st1 {v17.8b}, [TMP2] | 596 st1 {v17.8b}, [TMP2] |
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 597 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
596 add TMP1, TMP1, OUTPUT_COL | 598 add TMP1, TMP1, OUTPUT_COL |
597 add TMP2, TMP2, OUTPUT_COL | 599 add TMP2, TMP2, OUTPUT_COL |
598 st1 {v18.8b}, [TMP1] | 600 st1 {v18.8b}, [TMP1] |
599 add v20.8b, v20.8b, v0.8b | 601 add v20.8b, v20.8b, v0.8b |
600 add v21.8b, v21.8b, v0.8b | 602 add v21.8b, v21.8b, v0.8b |
601 st1 {v19.8b}, [TMP2] | 603 st1 {v19.8b}, [TMP2] |
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 604 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
603 ldp TMP3, TMP4, [OUTPUT_BUF] | 605 ldp TMP3, TMP4, [OUTPUT_BUF] |
604 add TMP1, TMP1, OUTPUT_COL | 606 add TMP1, TMP1, OUTPUT_COL |
605 add TMP2, TMP2, OUTPUT_COL | 607 add TMP2, TMP2, OUTPUT_COL |
606 add TMP3, TMP3, OUTPUT_COL | 608 add TMP3, TMP3, OUTPUT_COL |
607 add TMP4, TMP4, OUTPUT_COL | 609 add TMP4, TMP4, OUTPUT_COL |
608 transpose_single v22, v23, v3, .2d, .8b | 610 transpose_single v22, v23, v3, .d, .8b |
609 st1 {v20.8b}, [TMP1] | 611 st1 {v20.8b}, [TMP1] |
610 add v22.8b, v22.8b, v0.8b | 612 add v22.8b, v22.8b, v0.8b |
611 add v23.8b, v23.8b, v0.8b | 613 add v23.8b, v23.8b, v0.8b |
612 st1 {v21.8b}, [TMP2] | 614 st1 {v21.8b}, [TMP2] |
613 st1 {v22.8b}, [TMP3] | 615 st1 {v22.8b}, [TMP3] |
614 st1 {v23.8b}, [TMP4] | 616 st1 {v23.8b}, [TMP4] |
615 ldr x15, [sp], 16 | 617 ldr x15, [sp], 16 |
616 ld1 {v0.8b - v3.8b}, [sp], 32 | 618 ld1 {v0.8b - v3.8b}, [sp], 32 |
617 ld1 {v4.8b - v7.8b}, [sp], 32 | 619 ld1 {v4.8b - v7.8b}, [sp], 32 |
618 ld1 {v8.8b - v11.8b}, [sp], 32 | 620 ld1 {v8.8b - v11.8b}, [sp], 32 |
(...skipping 13 matching lines...) Expand all Loading... | |
632 transpose ROW4L, ROW5L, v3, .16b, .4h | 634 transpose ROW4L, ROW5L, v3, .16b, .4h |
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | 635 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ |
634 transpose ROW1L, ROW3L, v3, .16b, .2s | 636 transpose ROW1L, ROW3L, v3, .16b, .2s |
635 transpose ROW4L, ROW6L, v3, .16b, .2s | 637 transpose ROW4L, ROW6L, v3, .16b, .2s |
636 transpose ROW0L, ROW2L, v3, .16b, .2s | 638 transpose ROW0L, ROW2L, v3, .16b, .2s |
637 transpose ROW5L, ROW7L, v3, .16b, .2s | 639 transpose ROW5L, ROW7L, v3, .16b, .2s |
638 cmp x0, #0 | 640 cmp x0, #0 |
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pa ss */ | 641 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pa ss */ |
640 | 642 |
641 /* Only row 0 is non-zero for the right 4x8 half */ | 643 /* Only row 0 is non-zero for the right 4x8 half */ |
642 dup ROW1R.4h, ROW0R.4h[1] | 644 dup ROW1R.4h, ROW0R.h[1] |
643 dup ROW2R.4h, ROW0R.4h[2] | 645 dup ROW2R.4h, ROW0R.h[2] |
644 dup ROW3R.4h, ROW0R.4h[3] | 646 dup ROW3R.4h, ROW0R.h[3] |
645 dup ROW4R.4h, ROW0R.4h[0] | 647 dup ROW4R.4h, ROW0R.h[0] |
646 dup ROW5R.4h, ROW0R.4h[1] | 648 dup ROW5R.4h, ROW0R.h[1] |
647 dup ROW6R.4h, ROW0R.4h[2] | 649 dup ROW6R.4h, ROW0R.h[2] |
648 dup ROW7R.4h, ROW0R.4h[3] | 650 dup ROW7R.4h, ROW0R.h[3] |
649 dup ROW0R.4h, ROW0R.4h[0] | 651 dup ROW0R.4h, ROW0R.h[0] |
650 b 1b /* Go to 'normal' second pass */ | 652 b 1b /* Go to 'normal' second pass */ |
651 | 653 |
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | 654 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
653 ld1 {v2.4h}, [x15] /* reload constants */ | 655 ld1 {v2.4h}, [x15] /* reload constants */ |
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 | 656 smull v12.4s, ROW1L.4h, XFIX_1_175875602 |
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 657 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 | 658 smull v14.4s, ROW3L.4h, XFIX_1_175875602 |
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 659 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 660 smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
659 sshll v6.4s, ROW0L.4h, #13 | 661 sshll v6.4s, ROW0L.4h, #13 |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
763 * function from jidctfst.c | 765 * function from jidctfst.c |
764 * | 766 * |
765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. | 767 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. |
766 * But in ARM NEON case some extra additions are required because VQDMULH | 768 * But in ARM NEON case some extra additions are required because VQDMULH |
767 * instruction can't handle the constants larger than 1. So the expressions | 769 * instruction can't handle the constants larger than 1. So the expressions |
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", | 770 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
769 * which introduces an extra addition. Overall, there are 6 extra additions | 771 * which introduces an extra addition. Overall, there are 6 extra additions |
770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. | 772 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. |
771 */ | 773 */ |
772 | 774 |
773 #define XFIX_1_082392200 v0.4h[0] | 775 #define XFIX_1_082392200 v0.h[0] |
774 #define XFIX_1_414213562 v0.4h[1] | 776 #define XFIX_1_414213562 v0.h[1] |
775 #define XFIX_1_847759065 v0.4h[2] | 777 #define XFIX_1_847759065 v0.h[2] |
776 #define XFIX_2_613125930 v0.4h[3] | 778 #define XFIX_2_613125930 v0.h[3] |
777 | 779 |
778 .balign 16 | 780 .balign 16 |
779 jsimd_idct_ifast_neon_consts: | 781 jsimd_idct_ifast_neon_consts: |
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 782 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 783 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 784 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 785 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
784 | 786 |
785 asm_function jsimd_idct_ifast_neon | 787 asm_function jsimd_idct_ifast_neon |
786 | 788 |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
902 /* Transpose q8-q10 */ | 904 /* Transpose q8-q10 */ |
903 mov v18.16b, v8.16b | 905 mov v18.16b, v8.16b |
904 trn1 v8.4s, v8.4s, v10.4s | 906 trn1 v8.4s, v8.4s, v10.4s |
905 trn2 v10.4s, v18.4s, v10.4s | 907 trn2 v10.4s, v18.4s, v10.4s |
906 /* Transpose q13-q15 */ | 908 /* Transpose q13-q15 */ |
907 mov v18.16b, v13.16b | 909 mov v18.16b, v13.16b |
908 trn1 v13.4s, v13.4s, v15.4s | 910 trn1 v13.4s, v13.4s, v15.4s |
909 trn2 v15.4s, v18.4s, v15.4s | 911 trn2 v15.4s, v18.4s, v15.4s |
910 /* vswp v14.4h, v10-MSB.4h */ | 912 /* vswp v14.4h, v10-MSB.4h */ |
911 umov x22, v14.d[0] | 913 umov x22, v14.d[0] |
912 ins v14.2d[0], v10.2d[1] | 914 ins v14.d[0], v10.d[1] |
913 ins v10.2d[1], x22 | 915 ins v10.d[1], x22 |
914 /* vswp v13.4h, v9MSB.4h */ | 916 /* vswp v13.4h, v9MSB.4h */ |
915 | 917 |
916 umov x22, v13.d[0] | 918 umov x22, v13.d[0] |
917 ins v13.2d[0], v9.2d[1] | 919 ins v13.d[0], v9.d[1] |
918 ins v9.2d[1], x22 | 920 ins v9.d[1], x22 |
919 /* 1-D IDCT, pass 2 */ | 921 /* 1-D IDCT, pass 2 */ |
920 sub v2.8h, v10.8h, v14.8h | 922 sub v2.8h, v10.8h, v14.8h |
921 /* vswp v15.4h, v11MSB.4h */ | 923 /* vswp v15.4h, v11MSB.4h */ |
922 umov x22, v15.d[0] | 924 umov x22, v15.d[0] |
923 ins v15.2d[0], v11.2d[1] | 925 ins v15.d[0], v11.d[1] |
924 ins v11.2d[1], x22 | 926 ins v11.d[1], x22 |
925 add v14.8h, v10.8h, v14.8h | 927 add v14.8h, v10.8h, v14.8h |
926 /* vswp v12.4h, v8-MSB.4h */ | 928 /* vswp v12.4h, v8-MSB.4h */ |
927 umov x22, v12.d[0] | 929 umov x22, v12.d[0] |
928 ins v12.2d[0], v8.2d[1] | 930 ins v12.d[0], v8.d[1] |
929 ins v8.2d[1], x22 | 931 ins v8.d[1], x22 |
930 sub v1.8h, v11.8h, v13.8h | 932 sub v1.8h, v11.8h, v13.8h |
931 add v13.8h, v11.8h, v13.8h | 933 add v13.8h, v11.8h, v13.8h |
932 sub v5.8h, v9.8h, v15.8h | 934 sub v5.8h, v9.8h, v15.8h |
933 add v15.8h, v9.8h, v15.8h | 935 add v15.8h, v9.8h, v15.8h |
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | 936 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | 937 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
936 add v3.8h, v1.8h, v1.8h | 938 add v3.8h, v1.8h, v1.8h |
937 sub v1.8h, v5.8h, v1.8h | 939 sub v1.8h, v5.8h, v1.8h |
938 add v10.8h, v2.8h, v4.8h | 940 add v10.8h, v2.8h, v4.8h |
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | 941 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
990 trn2 v11.8h, v18.8h, v11.8h | 992 trn2 v11.8h, v18.8h, v11.8h |
991 /* Transpose q8-q10 */ | 993 /* Transpose q8-q10 */ |
992 mov v18.16b, v8.16b | 994 mov v18.16b, v8.16b |
993 trn1 v8.4s, v8.4s, v10.4s | 995 trn1 v8.4s, v8.4s, v10.4s |
994 trn2 v10.4s, v18.4s, v10.4s | 996 trn2 v10.4s, v18.4s, v10.4s |
995 /* Transpose q9-q11 */ | 997 /* Transpose q9-q11 */ |
996 mov v18.16b, v9.16b | 998 mov v18.16b, v9.16b |
997 trn1 v9.4s, v9.4s, v11.4s | 999 trn1 v9.4s, v9.4s, v11.4s |
998 trn2 v11.4s, v18.4s, v11.4s | 1000 trn2 v11.4s, v18.4s, v11.4s |
999 /* make copy */ | 1001 /* make copy */ |
1000 ins v17.2d[0], v8.2d[1] | 1002 ins v17.d[0], v8.d[1] |
1001 /* Transpose d16-d17-msb */ | 1003 /* Transpose d16-d17-msb */ |
1002 mov v18.16b, v8.16b | 1004 mov v18.16b, v8.16b |
1003 trn1 v8.8b, v8.8b, v17.8b | 1005 trn1 v8.8b, v8.8b, v17.8b |
1004 trn2 v17.8b, v18.8b, v17.8b | 1006 trn2 v17.8b, v18.8b, v17.8b |
1005 /* make copy */ | 1007 /* make copy */ |
1006 ins v19.2d[0], v9.2d[1] | 1008 ins v19.d[0], v9.d[1] |
1007 mov v18.16b, v9.16b | 1009 mov v18.16b, v9.16b |
1008 trn1 v9.8b, v9.8b, v19.8b | 1010 trn1 v9.8b, v9.8b, v19.8b |
1009 trn2 v19.8b, v18.8b, v19.8b | 1011 trn2 v19.8b, v18.8b, v19.8b |
1010 /* Store results to the output buffer */ | 1012 /* Store results to the output buffer */ |
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1013 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1012 add TMP1, TMP1, OUTPUT_COL | 1014 add TMP1, TMP1, OUTPUT_COL |
1013 add TMP2, TMP2, OUTPUT_COL | 1015 add TMP2, TMP2, OUTPUT_COL |
1014 st1 {v8.8b}, [TMP1] | 1016 st1 {v8.8b}, [TMP1] |
1015 st1 {v17.8b}, [TMP2] | 1017 st1 {v17.8b}, [TMP2] |
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1018 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1017 add TMP1, TMP1, OUTPUT_COL | 1019 add TMP1, TMP1, OUTPUT_COL |
1018 add TMP2, TMP2, OUTPUT_COL | 1020 add TMP2, TMP2, OUTPUT_COL |
1019 st1 {v9.8b}, [TMP1] | 1021 st1 {v9.8b}, [TMP1] |
1020 /* make copy */ | 1022 /* make copy */ |
1021 ins v7.2d[0], v10.2d[1] | 1023 ins v7.d[0], v10.d[1] |
1022 mov v18.16b, v10.16b | 1024 mov v18.16b, v10.16b |
1023 trn1 v10.8b, v10.8b, v7.8b | 1025 trn1 v10.8b, v10.8b, v7.8b |
1024 trn2 v7.8b, v18.8b, v7.8b | 1026 trn2 v7.8b, v18.8b, v7.8b |
1025 st1 {v19.8b}, [TMP2] | 1027 st1 {v19.8b}, [TMP2] |
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1028 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 | 1029 ldp TMP4, TMP5, [OUTPUT_BUF], 16 |
1028 add TMP1, TMP1, OUTPUT_COL | 1030 add TMP1, TMP1, OUTPUT_COL |
1029 add TMP2, TMP2, OUTPUT_COL | 1031 add TMP2, TMP2, OUTPUT_COL |
1030 add TMP4, TMP4, OUTPUT_COL | 1032 add TMP4, TMP4, OUTPUT_COL |
1031 add TMP5, TMP5, OUTPUT_COL | 1033 add TMP5, TMP5, OUTPUT_COL |
1032 st1 {v10.8b}, [TMP1] | 1034 st1 {v10.8b}, [TMP1] |
1033 /* make copy */ | 1035 /* make copy */ |
1034 ins v16.2d[0], v11.2d[1] | 1036 ins v16.d[0], v11.d[1] |
1035 mov v18.16b, v11.16b | 1037 mov v18.16b, v11.16b |
1036 trn1 v11.8b, v11.8b, v16.8b | 1038 trn1 v11.8b, v11.8b, v16.8b |
1037 trn2 v16.8b, v18.8b, v16.8b | 1039 trn2 v16.8b, v18.8b, v16.8b |
1038 st1 {v7.8b}, [TMP2] | 1040 st1 {v7.8b}, [TMP2] |
1039 st1 {v11.8b}, [TMP4] | 1041 st1 {v11.8b}, [TMP4] |
1040 st1 {v16.8b}, [TMP5] | 1042 st1 {v16.8b}, [TMP5] |
1041 sub sp, sp, #176 | 1043 sub sp, sp, #176 |
1042 ldp x22, x23, [sp], 16 | 1044 ldp x22, x23, [sp], 16 |
1043 ld1 {v0.8b - v3.8b}, [sp], 32 | 1045 ld1 {v0.8b - v3.8b}, [sp], 32 |
1044 ld1 {v4.8b - v7.8b}, [sp], 32 | 1046 ld1 {v4.8b - v7.8b}, [sp], 32 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | 1091 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | 1092 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | 1093 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | 1094 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | 1095 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | 1096 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | 1097 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
1096 | 1098 |
1097 .balign 16 | 1099 .balign 16 |
1098 jsimd_idct_4x4_neon_consts: | 1100 jsimd_idct_4x4_neon_consts: |
1099 .short FIX_1_847759065 /* v0.4h[0] */ | 1101 .short FIX_1_847759065 /* v0.h[0] */ |
1100 .short -FIX_0_765366865 /* v0.4h[1] */ | 1102 .short -FIX_0_765366865 /* v0.h[1] */ |
1101 .short -FIX_0_211164243 /* v0.4h[2] */ | 1103 .short -FIX_0_211164243 /* v0.h[2] */ |
1102 .short FIX_1_451774981 /* v0.4h[3] */ | 1104 .short FIX_1_451774981 /* v0.h[3] */ |
1103 .short -FIX_2_172734803 /* d1[0] */ | 1105 .short -FIX_2_172734803 /* d1[0] */ |
1104 .short FIX_1_061594337 /* d1[1] */ | 1106 .short FIX_1_061594337 /* d1[1] */ |
1105 .short -FIX_0_509795579 /* d1[2] */ | 1107 .short -FIX_0_509795579 /* d1[2] */ |
1106 .short -FIX_0_601344887 /* d1[3] */ | 1108 .short -FIX_0_601344887 /* d1[3] */ |
1107 .short FIX_0_899976223 /* v2.4h[0] */ | 1109 .short FIX_0_899976223 /* v2.h[0] */ |
1108 .short FIX_2_562915447 /* v2.4h[1] */ | 1110 .short FIX_2_562915447 /* v2.h[1] */ |
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | 1111 .short 1 << (CONST_BITS+1) /* v2.h[2] */ |
1110 .short 0 /* v2.4h[3] */ | 1112 .short 0 /* v2.h[3] */ |
1111 | 1113 |
1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | 1114 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
1113 smull v28.4s, \x4, v2.4h[2] | 1115 smull v28.4s, \x4, v2.h[2] |
1114 smlal v28.4s, \x8, v0.4h[0] | 1116 smlal v28.4s, \x8, v0.h[0] |
1115 smlal v28.4s, \x14, v0.4h[1] | 1117 smlal v28.4s, \x14, v0.h[1] |
1116 | 1118 |
1117 smull v26.4s, \x16, v1.4h[2] | 1119 smull v26.4s, \x16, v1.h[2] |
1118 smlal v26.4s, \x12, v1.4h[3] | 1120 smlal v26.4s, \x12, v1.h[3] |
1119 smlal v26.4s, \x10, v2.4h[0] | 1121 smlal v26.4s, \x10, v2.h[0] |
1120 smlal v26.4s, \x6, v2.4h[1] | 1122 smlal v26.4s, \x6, v2.h[1] |
1121 | 1123 |
1122 smull v30.4s, \x4, v2.4h[2] | 1124 smull v30.4s, \x4, v2.h[2] |
1123 smlsl v30.4s, \x8, v0.4h[0] | 1125 smlsl v30.4s, \x8, v0.h[0] |
1124 smlsl v30.4s, \x14, v0.4h[1] | 1126 smlsl v30.4s, \x14, v0.h[1] |
1125 | 1127 |
1126 smull v24.4s, \x16, v0.4h[2] | 1128 smull v24.4s, \x16, v0.h[2] |
1127 smlal v24.4s, \x12, v0.4h[3] | 1129 smlal v24.4s, \x12, v0.h[3] |
1128 smlal v24.4s, \x10, v1.4h[0] | 1130 smlal v24.4s, \x10, v1.h[0] |
1129 smlal v24.4s, \x6, v1.4h[1] | 1131 smlal v24.4s, \x6, v1.h[1] |
1130 | 1132 |
1131 add v20.4s, v28.4s, v26.4s | 1133 add v20.4s, v28.4s, v26.4s |
1132 sub v28.4s, v28.4s, v26.4s | 1134 sub v28.4s, v28.4s, v26.4s |
1133 | 1135 |
1134 .if \shift > 16 | 1136 .if \shift > 16 |
1135 srshr v20.4s, v20.4s, #\shift | 1137 srshr v20.4s, v20.4s, #\shift |
1136 srshr v28.4s, v28.4s, #\shift | 1138 srshr v28.4s, v28.4s, #\shift |
1137 xtn \y26, v20.4s | 1139 xtn \y26, v20.4s |
1138 xtn \y29, v28.4s | 1140 xtn \y29, v28.4s |
1139 .else | 1141 .else |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1196 */ | 1198 */ |
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1199 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | 1200 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
1199 add COEF_BLOCK, COEF_BLOCK, #16 | 1201 add COEF_BLOCK, COEF_BLOCK, #16 |
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | 1202 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1203 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1202 /* dequantize */ | 1204 /* dequantize */ |
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1205 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1204 mul v4.4h, v4.4h, v18.4h | 1206 mul v4.4h, v4.4h, v18.4h |
1205 mul v5.4h, v5.4h, v19.4h | 1207 mul v5.4h, v5.4h, v19.4h |
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | 1208 ins v4.d[1], v5.d[0] /* 128 bit q4 */ |
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | 1209 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
1208 mul v6.4h, v6.4h, v20.4h | 1210 mul v6.4h, v6.4h, v20.4h |
1209 mul v7.4h, v7.4h, v21.4h | 1211 mul v7.4h, v7.4h, v21.4h |
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | 1212 ins v6.d[1], v7.d[0] /* 128 bit q6 */ |
1211 mul v8.4h, v8.4h, v22.4h | 1213 mul v8.4h, v8.4h, v22.4h |
1212 mul v9.4h, v9.4h, v23.4h | 1214 mul v9.4h, v9.4h, v23.4h |
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | 1215 ins v8.d[1], v9.d[0] /* 128 bit q8 */ |
1214 add DCT_TABLE, DCT_TABLE, #16 | 1216 add DCT_TABLE, DCT_TABLE, #16 |
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | 1217 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
1216 mul v10.4h, v10.4h, v24.4h | 1218 mul v10.4h, v10.4h, v24.4h |
1217 mul v11.4h, v11.4h, v25.4h | 1219 mul v11.4h, v11.4h, v25.4h |
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | 1220 ins v10.d[1], v11.d[0] /* 128 bit q10 */ |
1219 mul v12.4h, v12.4h, v26.4h | 1221 mul v12.4h, v12.4h, v26.4h |
1220 mul v13.4h, v13.4h, v27.4h | 1222 mul v13.4h, v13.4h, v27.4h |
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | 1223 ins v12.d[1], v13.d[0] /* 128 bit q12 */ |
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1224 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1223 mul v14.4h, v14.4h, v28.4h | 1225 mul v14.4h, v14.4h, v28.4h |
1224 mul v15.4h, v15.4h, v29.4h | 1226 mul v15.4h, v15.4h, v29.4h |
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | 1227 ins v14.d[1], v15.d[0] /* 128 bit q14 */ |
1226 mul v16.4h, v16.4h, v30.4h | 1228 mul v16.4h, v16.4h, v30.4h |
1227 mul v17.4h, v17.4h, v31.4h | 1229 mul v17.4h, v17.4h, v31.4h |
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | 1230 ins v16.d[1], v17.d[0] /* 128 bit q16 */ |
1229 | 1231 |
1230 /* Pass 1 */ | 1232 /* Pass 1 */ |
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4. 4h, v6.4h, v8.4h, v10.4h | 1233 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4. 4h, v6.4h, v8.4h, v10.4h |
1232 transpose_4x4 v4, v6, v8, v10, v3 | 1234 transpose_4x4 v4, v6, v8, v10, v3 |
1233 ins v10.2d[1], v11.2d[0] | 1235 ins v10.d[1], v11.d[0] |
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5. 4h, v7.4h, v9.4h, v11.4h | 1236 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5. 4h, v7.4h, v9.4h, v11.4h |
1235 transpose_4x4 v5, v7, v9, v11, v3 | 1237 transpose_4x4 v5, v7, v9, v11, v3 |
1236 ins v10.2d[1], v11.2d[0] | 1238 ins v10.d[1], v11.d[0] |
1237 /* Pass 2 */ | 1239 /* Pass 2 */ |
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h | 1240 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
1239 transpose_4x4 v26, v27, v28, v29, v3 | 1241 transpose_4x4 v26, v27, v28, v29, v3 |
1240 | 1242 |
1241 /* Range limit */ | 1243 /* Range limit */ |
1242 movi v30.8h, #0x80 | 1244 movi v30.8h, #0x80 |
1243 ins v26.2d[1], v27.2d[0] | 1245 ins v26.d[1], v27.d[0] |
1244 ins v28.2d[1], v29.2d[0] | 1246 ins v28.d[1], v29.d[0] |
1245 add v26.8h, v26.8h, v30.8h | 1247 add v26.8h, v26.8h, v30.8h |
1246 add v28.8h, v28.8h, v30.8h | 1248 add v28.8h, v28.8h, v30.8h |
1247 sqxtun v26.8b, v26.8h | 1249 sqxtun v26.8b, v26.8h |
1248 sqxtun v27.8b, v28.8h | 1250 sqxtun v27.8b, v28.8h |
1249 | 1251 |
1250 /* Store results to the output buffer */ | 1252 /* Store results to the output buffer */ |
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1253 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1252 ldp TMP3, TMP4, [OUTPUT_BUF] | 1254 ldp TMP3, TMP4, [OUTPUT_BUF] |
1253 add TMP1, TMP1, OUTPUT_COL | 1255 add TMP1, TMP1, OUTPUT_COL |
1254 add TMP2, TMP2, OUTPUT_COL | 1256 add TMP2, TMP2, OUTPUT_COL |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1326 | 1328 |
1327 .balign 8 | 1329 .balign 8 |
1328 jsimd_idct_2x2_neon_consts: | 1330 jsimd_idct_2x2_neon_consts: |
1329 .short -FIX_0_720959822 /* v14[0] */ | 1331 .short -FIX_0_720959822 /* v14[0] */ |
1330 .short FIX_0_850430095 /* v14[1] */ | 1332 .short FIX_0_850430095 /* v14[1] */ |
1331 .short -FIX_1_272758580 /* v14[2] */ | 1333 .short -FIX_1_272758580 /* v14[2] */ |
1332 .short FIX_3_624509785 /* v14[3] */ | 1334 .short FIX_3_624509785 /* v14[3] */ |
1333 | 1335 |
1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | 1336 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
1335 sshll v15.4s, \x4, #15 | 1337 sshll v15.4s, \x4, #15 |
1336 smull v26.4s, \x6, v14.4h[3] | 1338 smull v26.4s, \x6, v14.h[3] |
1337 smlal v26.4s, \x10, v14.4h[2] | 1339 smlal v26.4s, \x10, v14.h[2] |
1338 smlal v26.4s, \x12, v14.4h[1] | 1340 smlal v26.4s, \x12, v14.h[1] |
1339 smlal v26.4s, \x16, v14.4h[0] | 1341 smlal v26.4s, \x16, v14.h[0] |
1340 | 1342 |
1341 add v20.4s, v15.4s, v26.4s | 1343 add v20.4s, v15.4s, v26.4s |
1342 sub v15.4s, v15.4s, v26.4s | 1344 sub v15.4s, v15.4s, v26.4s |
1343 | 1345 |
1344 .if \shift > 16 | 1346 .if \shift > 16 |
1345 srshr v20.4s, v20.4s, #\shift | 1347 srshr v20.4s, v20.4s, #\shift |
1346 srshr v15.4s, v15.4s, #\shift | 1348 srshr v15.4s, v15.4s, #\shift |
1347 xtn \y26, v20.4s | 1349 xtn \y26, v20.4s |
1348 xtn \y27, v15.4s | 1350 xtn \y27, v15.4s |
1349 .else | 1351 .else |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1393 add COEF_BLOCK, COEF_BLOCK, #16 | 1395 add COEF_BLOCK, COEF_BLOCK, #16 |
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | 1396 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
1395 add COEF_BLOCK, COEF_BLOCK, #16 | 1397 add COEF_BLOCK, COEF_BLOCK, #16 |
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | 1398 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
1397 add COEF_BLOCK, COEF_BLOCK, #16 | 1399 add COEF_BLOCK, COEF_BLOCK, #16 |
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1400 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1399 /* Dequantize */ | 1401 /* Dequantize */ |
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1402 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1401 mul v4.4h, v4.4h, v18.4h | 1403 mul v4.4h, v4.4h, v18.4h |
1402 mul v5.4h, v5.4h, v19.4h | 1404 mul v5.4h, v5.4h, v19.4h |
1403 ins v4.2d[1], v5.2d[0] | 1405 ins v4.d[1], v5.d[0] |
1404 mul v6.4h, v6.4h, v20.4h | 1406 mul v6.4h, v6.4h, v20.4h |
1405 mul v7.4h, v7.4h, v21.4h | 1407 mul v7.4h, v7.4h, v21.4h |
1406 ins v6.2d[1], v7.2d[0] | 1408 ins v6.d[1], v7.d[0] |
1407 add DCT_TABLE, DCT_TABLE, #16 | 1409 add DCT_TABLE, DCT_TABLE, #16 |
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | 1410 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
1409 mul v10.4h, v10.4h, v24.4h | 1411 mul v10.4h, v10.4h, v24.4h |
1410 mul v11.4h, v11.4h, v25.4h | 1412 mul v11.4h, v11.4h, v25.4h |
1411 ins v10.2d[1], v11.2d[0] | 1413 ins v10.d[1], v11.d[0] |
1412 add DCT_TABLE, DCT_TABLE, #16 | 1414 add DCT_TABLE, DCT_TABLE, #16 |
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | 1415 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
1414 mul v12.4h, v12.4h, v26.4h | 1416 mul v12.4h, v12.4h, v26.4h |
1415 mul v13.4h, v13.4h, v27.4h | 1417 mul v13.4h, v13.4h, v27.4h |
1416 ins v12.2d[1], v13.2d[0] | 1418 ins v12.d[1], v13.d[0] |
1417 add DCT_TABLE, DCT_TABLE, #16 | 1419 add DCT_TABLE, DCT_TABLE, #16 |
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1420 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1419 mul v16.4h, v16.4h, v30.4h | 1421 mul v16.4h, v16.4h, v30.4h |
1420 mul v17.4h, v17.4h, v31.4h | 1422 mul v17.4h, v17.4h, v31.4h |
1421 ins v16.2d[1], v17.2d[0] | 1423 ins v16.d[1], v17.d[0] |
1422 | 1424 |
1423 /* Pass 1 */ | 1425 /* Pass 1 */ |
1424 #if 0 | 1426 #if 0 |
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | 1427 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | 1428 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | 1429 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | 1430 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
1429 #else | 1431 #else |
1430 smull v26.4s, v6.4h, v14.4h[3] | 1432 smull v26.4s, v6.4h, v14.h[3] |
1431 smlal v26.4s, v10.4h, v14.4h[2] | 1433 smlal v26.4s, v10.4h, v14.h[2] |
1432 smlal v26.4s, v12.4h, v14.4h[1] | 1434 smlal v26.4s, v12.4h, v14.h[1] |
1433 smlal v26.4s, v16.4h, v14.4h[0] | 1435 smlal v26.4s, v16.4h, v14.h[0] |
1434 smull v24.4s, v7.4h, v14.4h[3] | 1436 smull v24.4s, v7.4h, v14.h[3] |
1435 smlal v24.4s, v11.4h, v14.4h[2] | 1437 smlal v24.4s, v11.4h, v14.h[2] |
1436 smlal v24.4s, v13.4h, v14.4h[1] | 1438 smlal v24.4s, v13.4h, v14.h[1] |
1437 smlal v24.4s, v17.4h, v14.4h[0] | 1439 smlal v24.4s, v17.4h, v14.h[0] |
1438 sshll v15.4s, v4.4h, #15 | 1440 sshll v15.4s, v4.4h, #15 |
1439 sshll v30.4s, v5.4h, #15 | 1441 sshll v30.4s, v5.4h, #15 |
1440 add v20.4s, v15.4s, v26.4s | 1442 add v20.4s, v15.4s, v26.4s |
1441 sub v15.4s, v15.4s, v26.4s | 1443 sub v15.4s, v15.4s, v26.4s |
1442 rshrn v4.4h, v20.4s, #13 | 1444 rshrn v4.4h, v20.4s, #13 |
1443 rshrn v6.4h, v15.4s, #13 | 1445 rshrn v6.4h, v15.4s, #13 |
1444 add v20.4s, v30.4s, v24.4s | 1446 add v20.4s, v30.4s, v24.4s |
1445 sub v15.4s, v30.4s, v24.4s | 1447 sub v15.4s, v30.4s, v24.4s |
1446 rshrn v5.4h, v20.4s, #13 | 1448 rshrn v5.4h, v20.4s, #13 |
1447 rshrn v7.4h, v15.4s, #13 | 1449 rshrn v7.4h, v15.4s, #13 |
1448 ins v4.2d[1], v5.2d[0] | 1450 ins v4.d[1], v5.d[0] |
1449 ins v6.2d[1], v7.2d[0] | 1451 ins v6.d[1], v7.d[0] |
1450 transpose v4, v6, v3, .16b, .8h | 1452 transpose v4, v6, v3, .16b, .8h |
1451 transpose v6, v10, v3, .16b, .4s | 1453 transpose v6, v10, v3, .16b, .4s |
1452 ins v11.2d[0], v10.2d[1] | 1454 ins v11.d[0], v10.d[1] |
1453 ins v7.2d[0], v6.2d[1] | 1455 ins v7.d[0], v6.d[1] |
1454 #endif | 1456 #endif |
1455 | 1457 |
1456 /* Pass 2 */ | 1458 /* Pass 2 */ |
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | 1459 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
1458 | 1460 |
1459 /* Range limit */ | 1461 /* Range limit */ |
1460 movi v30.8h, #0x80 | 1462 movi v30.8h, #0x80 |
1461 ins v26.2d[1], v27.2d[0] | 1463 ins v26.d[1], v27.d[0] |
1462 add v26.8h, v26.8h, v30.8h | 1464 add v26.8h, v26.8h, v30.8h |
1463 sqxtun v30.8b, v26.8h | 1465 sqxtun v30.8b, v26.8h |
1464 ins v26.2d[0], v30.2d[0] | 1466 ins v26.d[0], v30.d[0] |
1465 sqxtun v27.8b, v26.8h | 1467 sqxtun v27.8b, v26.8h |
1466 | 1468 |
1467 /* Store results to the output buffer */ | 1469 /* Store results to the output buffer */ |
1468 ldp TMP1, TMP2, [OUTPUT_BUF] | 1470 ldp TMP1, TMP2, [OUTPUT_BUF] |
1469 add TMP1, TMP1, OUTPUT_COL | 1471 add TMP1, TMP1, OUTPUT_COL |
1470 add TMP2, TMP2, OUTPUT_COL | 1472 add TMP2, TMP2, OUTPUT_COL |
1471 | 1473 |
1472 st1 {v26.b}[0], [TMP1], 1 | 1474 st1 {v26.b}[0], [TMP1], 1 |
1473 st1 {v27.b}[4], [TMP1], 1 | 1475 st1 {v27.b}[4], [TMP1], 1 |
1474 st1 {v26.b}[1], [TMP2], 1 | 1476 st1 {v26.b}[1], [TMP2], 1 |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1597 .endif | 1599 .endif |
1598 .endm | 1600 .endm |
1599 | 1601 |
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize | 1602 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize |
1601 | 1603 |
1602 /* | 1604 /* |
1603 * 2-stage pipelined YCbCr->RGB conversion | 1605 * 2-stage pipelined YCbCr->RGB conversion |
1604 */ | 1606 */ |
1605 | 1607 |
1606 .macro do_yuv_to_rgb_stage1 | 1608 .macro do_yuv_to_rgb_stage1 |
1607 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ | 1609 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ |
1608 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1610 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1611 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1612 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1613 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1614 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1615 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1616 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1617 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1618 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1617 .endm | 1619 .endm |
1618 | 1620 |
1619 .macro do_yuv_to_rgb_stage2 | 1621 .macro do_yuv_to_rgb_stage2 |
1620 rshrn v20.4h, v20.4s, #15 | 1622 rshrn v20.4h, v20.4s, #15 |
1621 rshrn2 v20.8h, v22.4s, #15 | 1623 rshrn2 v20.8h, v22.4s, #15 |
1622 rshrn v24.4h, v24.4s, #14 | 1624 rshrn v24.4h, v24.4s, #14 |
1623 rshrn2 v24.8h, v26.4s, #14 | 1625 rshrn2 v24.8h, v26.4s, #14 |
1624 rshrn v28.4h, v28.4s, #14 | 1626 rshrn v28.4h, v28.4s, #14 |
1625 rshrn2 v28.8h, v30.4s, #14 | 1627 rshrn2 v28.8h, v30.4s, #14 |
1626 uaddw v20.8h, v20.8h, v0.8b | 1628 uaddw v20.8h, v20.8h, v0.8b |
(...skipping 26 matching lines...) Expand all Loading... | |
1653 uaddw v24.8h, v24.8h, v0.8b | 1655 uaddw v24.8h, v24.8h, v0.8b |
1654 uaddw v28.8h, v28.8h, v0.8b | 1656 uaddw v28.8h, v28.8h, v0.8b |
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | 1657 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ |
1656 sqxtun v1\g_offs\defsize, v20.8h | 1658 sqxtun v1\g_offs\defsize, v20.8h |
1657 ld1 {v0.8b}, [Y], 8 | 1659 ld1 {v0.8b}, [Y], 8 |
1658 sqxtun v1\r_offs\defsize, v24.8h | 1660 sqxtun v1\r_offs\defsize, v24.8h |
1659 prfm PLDL1KEEP, [U, #64] | 1661 prfm PLDL1KEEP, [U, #64] |
1660 prfm PLDL1KEEP, [V, #64] | 1662 prfm PLDL1KEEP, [V, #64] |
1661 prfm PLDL1KEEP, [Y, #64] | 1663 prfm PLDL1KEEP, [Y, #64] |
1662 sqxtun v1\b_offs\defsize, v28.8h | 1664 sqxtun v1\b_offs\defsize, v28.8h |
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ | 1665 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1664 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1666 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1667 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1668 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1669 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1670 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1671 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1672 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1671 .else /**************************** rgb565 ***********************************/ | 1673 .else /**************************** rgb565 ***********************************/ |
1672 sqshlu v21.8h, v20.8h, #8 | 1674 sqshlu v21.8h, v20.8h, #8 |
1673 sqshlu v25.8h, v24.8h, #8 | 1675 sqshlu v25.8h, v24.8h, #8 |
1674 sqshlu v29.8h, v28.8h, #8 | 1676 sqshlu v29.8h, v28.8h, #8 |
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ | 1677 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1676 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1678 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
1677 ld1 {v0.8b}, [Y], 8 | 1679 ld1 {v0.8b}, [Y], 8 |
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1680 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1681 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1682 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1683 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1682 sri v25.8h, v21.8h, #5 | 1684 sri v25.8h, v21.8h, #5 |
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1685 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1686 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1685 prfm PLDL1KEEP, [U, #64] | 1687 prfm PLDL1KEEP, [U, #64] |
1686 prfm PLDL1KEEP, [V, #64] | 1688 prfm PLDL1KEEP, [V, #64] |
1687 prfm PLDL1KEEP, [Y, #64] | 1689 prfm PLDL1KEEP, [Y, #64] |
1688 sri v25.8h, v29.8h, #11 | 1690 sri v25.8h, v29.8h, #11 |
1689 .endif | 1691 .endif |
1690 do_store \bpp, 8 | 1692 do_store \bpp, 8 |
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1693 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1694 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1693 .endm | 1695 .endm |
1694 | 1696 |
1695 .macro do_yuv_to_rgb | 1697 .macro do_yuv_to_rgb |
1696 do_yuv_to_rgb_stage1 | 1698 do_yuv_to_rgb_stage1 |
1697 do_yuv_to_rgb_stage2 | 1699 do_yuv_to_rgb_stage2 |
1698 .endm | 1700 .endm |
1699 | 1701 |
1700 /* Apple gas crashes on adrl, work around that by using adr. | 1702 /* Apple gas crashes on adrl, work around that by using adr. |
1701 * But this requires a copy of these constants for each function. | 1703 * But this requires a copy of these constants for each function. |
1702 */ | 1704 */ |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ | 1854 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b | 1855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b | 1856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b | 1857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b | 1858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b | 1859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b | 1860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b | 1861 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b
1860 .purgem do_load | 1862 .purgem do_load |
1861 .purgem do_store | 1863 .purgem do_store |
OLD | NEW |