Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(823)

Side by Side Diff: simd/jsimd_arm64_neon.S

Issue 1885373002: Fix simd access for 64bit ARM neon clang compilation (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * ARMv8 NEON optimizations for libjpeg-turbo 2 * ARMv8 NEON optimizations for libjpeg-turbo
3 * 3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved. 5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited 7 * Copyright (C) 2013-2014, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * 9 *
10 * This software is provided 'as-is', without any express or implied 10 * This software is provided 'as-is', without any express or implied
(...skipping 11 matching lines...) Expand all
22 * 2. Altered source versions must be plainly marked as such, and must not be 22 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software. 23 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution. 24 * 3. This notice may not be removed or altered from any source distribution.
25 */ 25 */
26 26
27 #if defined(__linux__) && defined(__ELF__) 27 #if defined(__linux__) && defined(__ELF__)
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ 28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
29 #endif 29 #endif
30 30
31 .text 31 .text
32 #ifndef __clang__
32 .arch armv8-a+fp+simd 33 .arch armv8-a+fp+simd
Nico 2016/04/20 20:53:30 again, can we please fix this in clang instead?
34 #endif
33 35
34 36
35 #define RESPECT_STRICT_ALIGNMENT 1 37 #define RESPECT_STRICT_ALIGNMENT 1
36 38
37 39
38 /*****************************************************************************/ 40 /*****************************************************************************/
39 41
40 /* Supplementary macro for setting function attributes */ 42 /* Supplementary macro for setting function attributes */
41 .macro asm_function fname 43 .macro asm_function fname
42 #ifdef __APPLE__ 44 #ifdef __APPLE__
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after
190 tmp0 = q4; \ 192 tmp0 = q4; \
191 tmp1 = q5; \ 193 tmp1 = q5; \
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ 194 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
193 tmp3 = q7; \ 195 tmp3 = q7; \
194 tmp10 = q2; \ 196 tmp10 = q2; \
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ 197 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
196 tmp12 = q3; \ 198 tmp12 = q3; \
197 tmp13 = q1; \ 199 tmp13 = q1; \
198 } 200 }
199 201
200 #define XFIX_0_899976223 v0.4h[0] 202 #define XFIX_0_899976223 v0.h[0]
201 #define XFIX_0_541196100 v0.4h[1] 203 #define XFIX_0_541196100 v0.h[1]
202 #define XFIX_2_562915447 v0.4h[2] 204 #define XFIX_2_562915447 v0.h[2]
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] 205 #define XFIX_0_298631336_MINUS_0_899976223 v0.h[3]
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] 206 #define XFIX_1_501321110_MINUS_0_899976223 v1.h[0]
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] 207 #define XFIX_2_053119869_MINUS_2_562915447 v1.h[1]
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] 208 #define XFIX_0_541196100_PLUS_0_765366865 v1.h[2]
207 #define XFIX_1_175875602 v1.4h[3] 209 #define XFIX_1_175875602 v1.h[3]
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] 210 #define XFIX_1_175875602_MINUS_0_390180644 v2.h[0]
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] 211 #define XFIX_0_541196100_MINUS_1_847759065 v2.h[1]
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] 212 #define XFIX_3_072711026_MINUS_2_562915447 v2.h[2]
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] 213 #define XFIX_1_175875602_MINUS_1_961570560 v2.h[3]
212 214
213 .balign 16 215 .balign 16
214 jsimd_idct_islow_neon_consts: 216 jsimd_idct_islow_neon_consts:
215 .short FIX_0_899976223 /* d0[0] */ 217 .short FIX_0_899976223 /* d0[0] */
216 .short FIX_0_541196100 /* d0[1] */ 218 .short FIX_0_541196100 /* d0[1] */
217 .short FIX_2_562915447 /* d0[2] */ 219 .short FIX_2_562915447 /* d0[2] */
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ 220 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ 221 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ 222 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ 223 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
263 st1 {v12.8b - v15.8b}, [sp], 32 265 st1 {v12.8b - v15.8b}, [sp], 32
264 st1 {v16.8b - v19.8b}, [sp], 32 266 st1 {v16.8b - v19.8b}, [sp], 32
265 st1 {v20.8b - v23.8b}, [sp], 32 267 st1 {v20.8b - v23.8b}, [sp], 32
266 st1 {v24.8b - v27.8b}, [sp], 32 268 st1 {v24.8b - v27.8b}, [sp], 32
267 st1 {v28.8b - v31.8b}, [sp], 32 269 st1 {v28.8b - v31.8b}, [sp], 32
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 270 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 271 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 272 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
271 mul v16.4h, v16.4h, v0.4h 273 mul v16.4h, v16.4h, v0.4h
272 mul v17.4h, v17.4h, v1.4h 274 mul v17.4h, v17.4h, v1.4h
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ 275 ins v16.d[1], v17.d[0] /* 128 bit q8 */
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 276 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
275 mul v18.4h, v18.4h, v2.4h 277 mul v18.4h, v18.4h, v2.4h
276 mul v19.4h, v19.4h, v3.4h 278 mul v19.4h, v19.4h, v3.4h
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ 279 ins v18.d[1], v19.d[0] /* 128 bit q9 */
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 280 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
279 mul v20.4h, v20.4h, v4.4h 281 mul v20.4h, v20.4h, v4.4h
280 mul v21.4h, v21.4h, v5.4h 282 mul v21.4h, v21.4h, v5.4h
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ 283 ins v20.d[1], v21.d[0] /* 128 bit q10 */
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 284 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
283 mul v22.4h, v22.4h, v6.4h 285 mul v22.4h, v22.4h, v6.4h
284 mul v23.4h, v23.4h, v7.4h 286 mul v23.4h, v23.4h, v7.4h
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ 287 ins v22.d[1], v23.d[0] /* 128 bit q11 */
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] 288 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
287 mul v24.4h, v24.4h, v0.4h 289 mul v24.4h, v24.4h, v0.4h
288 mul v25.4h, v25.4h, v1.4h 290 mul v25.4h, v25.4h, v1.4h
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ 291 ins v24.d[1], v25.d[0] /* 128 bit q12 */
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 292 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
291 mul v28.4h, v28.4h, v4.4h 293 mul v28.4h, v28.4h, v4.4h
292 mul v29.4h, v29.4h, v5.4h 294 mul v29.4h, v29.4h, v5.4h
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ 295 ins v28.d[1], v29.d[0] /* 128 bit q14 */
294 mul v26.4h, v26.4h, v2.4h 296 mul v26.4h, v26.4h, v2.4h
295 mul v27.4h, v27.4h, v3.4h 297 mul v27.4h, v27.4h, v3.4h
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ 298 ins v26.d[1], v27.d[0] /* 128 bit q13 */
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ 299 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
298 add x15, x15, #16 300 add x15, x15, #16
299 mul v30.4h, v30.4h, v6.4h 301 mul v30.4h, v30.4h, v6.4h
300 mul v31.4h, v31.4h, v7.4h 302 mul v31.4h, v31.4h, v7.4h
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ 303 ins v30.d[1], v31.d[0] /* 128 bit q15 */
302 /* Go to the bottom of the stack */ 304 /* Go to the bottom of the stack */
303 sub sp, sp, 352 305 sub sp, sp, 352
304 stp x4, x5, [sp], 16 306 stp x4, x5, [sp], 16
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ 307 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
306 st1 {v12.4h - v15.4h}, [sp], 32 308 st1 {v12.4h - v15.4h}, [sp], 32
307 /* 1-D IDCT, pass 1, left 4x8 half */ 309 /* 1-D IDCT, pass 1, left 4x8 half */
308 add v4.4h, ROW7L.4h, ROW3L.4h 310 add v4.4h, ROW7L.4h, ROW3L.4h
309 add v5.4h, ROW5L.4h, ROW1L.4h 311 add v5.4h, ROW5L.4h, ROW1L.4h
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 312 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
311 smlal v12.4s, v5.4h, XFIX_1_175875602 313 smlal v12.4s, v5.4h, XFIX_1_175875602
(...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after
546 add v12.4s, v4.4s, v14.4s 548 add v12.4s, v4.4s, v14.4s
547 sub v4.4s, v4.4s, v14.4s 549 sub v4.4s, v4.4s, v14.4s
548 add v10.4s, v2.4s, v8.4s 550 add v10.4s, v2.4s, v8.4s
549 sub v6.4s, v2.4s, v8.4s 551 sub v6.4s, v2.4s, v8.4s
550 shrn ROW7R.4h, v4.4s, #16 552 shrn ROW7R.4h, v4.4s, #16
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ 553 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ 554 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
553 shrn ROW4R.4h, v6.4s, #16 555 shrn ROW4R.4h, v6.4s, #16
554 556
555 2: /* Descale to 8-bit and range limit */ 557 2: /* Descale to 8-bit and range limit */
556 ins v16.2d[1], v17.2d[0] 558 ins v16.d[1], v17.d[0]
557 ins v18.2d[1], v19.2d[0] 559 ins v18.d[1], v19.d[0]
558 ins v20.2d[1], v21.2d[0] 560 ins v20.d[1], v21.d[0]
559 ins v22.2d[1], v23.2d[0] 561 ins v22.d[1], v23.d[0]
560 sqrshrn v16.8b, v16.8h, #2 562 sqrshrn v16.8b, v16.8h, #2
561 sqrshrn2 v16.16b, v18.8h, #2 563 sqrshrn2 v16.16b, v18.8h, #2
562 sqrshrn v18.8b, v20.8h, #2 564 sqrshrn v18.8b, v20.8h, #2
563 sqrshrn2 v18.16b, v22.8h, #2 565 sqrshrn2 v18.16b, v22.8h, #2
564 566
565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ 567 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
566 ld1 {v8.4h - v11.4h}, [sp], 32 568 ld1 {v8.4h - v11.4h}, [sp], 32
567 ld1 {v12.4h - v15.4h}, [sp], 32 569 ld1 {v12.4h - v15.4h}, [sp], 32
568 ins v24.2d[1], v25.2d[0] 570 ins v24.d[1], v25.d[0]
569 571
570 sqrshrn v20.8b, v24.8h, #2 572 sqrshrn v20.8b, v24.8h, #2
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ 573 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
572 /* trn1 v16.8h, v16.8h, v18.8h */ 574 /* trn1 v16.8h, v16.8h, v18.8h */
573 transpose v16, v18, v3, .16b, .8h 575 transpose v16, v18, v3, .16b, .8h
574 ins v26.2d[1], v27.2d[0] 576 ins v26.d[1], v27.d[0]
575 ins v28.2d[1], v29.2d[0] 577 ins v28.d[1], v29.d[0]
576 ins v30.2d[1], v31.2d[0] 578 ins v30.d[1], v31.d[0]
577 sqrshrn2 v20.16b, v26.8h, #2 579 sqrshrn2 v20.16b, v26.8h, #2
578 sqrshrn v22.8b, v28.8h, #2 580 sqrshrn v22.8b, v28.8h, #2
579 movi v0.16b, #(CENTERJSAMPLE) 581 movi v0.16b, #(CENTERJSAMPLE)
580 sqrshrn2 v22.16b, v30.8h, #2 582 sqrshrn2 v22.16b, v30.8h, #2
581 transpose_single v16, v17, v3, .2d, .8b 583 transpose_single v16, v17, v3, .d, .8b
582 transpose_single v18, v19, v3, .2d, .8b 584 transpose_single v18, v19, v3, .d, .8b
583 add v16.8b, v16.8b, v0.8b 585 add v16.8b, v16.8b, v0.8b
584 add v17.8b, v17.8b, v0.8b 586 add v17.8b, v17.8b, v0.8b
585 add v18.8b, v18.8b, v0.8b 587 add v18.8b, v18.8b, v0.8b
586 add v19.8b, v19.8b, v0.8b 588 add v19.8b, v19.8b, v0.8b
587 transpose v20, v22, v3, .16b, .8h 589 transpose v20, v22, v3, .16b, .8h
588 /* Store results to the output buffer */ 590 /* Store results to the output buffer */
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 591 ldp TMP1, TMP2, [OUTPUT_BUF], 16
590 add TMP1, TMP1, OUTPUT_COL 592 add TMP1, TMP1, OUTPUT_COL
591 add TMP2, TMP2, OUTPUT_COL 593 add TMP2, TMP2, OUTPUT_COL
592 st1 {v16.8b}, [TMP1] 594 st1 {v16.8b}, [TMP1]
593 transpose_single v20, v21, v3, .2d, .8b 595 transpose_single v20, v21, v3, .d, .8b
594 st1 {v17.8b}, [TMP2] 596 st1 {v17.8b}, [TMP2]
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 597 ldp TMP1, TMP2, [OUTPUT_BUF], 16
596 add TMP1, TMP1, OUTPUT_COL 598 add TMP1, TMP1, OUTPUT_COL
597 add TMP2, TMP2, OUTPUT_COL 599 add TMP2, TMP2, OUTPUT_COL
598 st1 {v18.8b}, [TMP1] 600 st1 {v18.8b}, [TMP1]
599 add v20.8b, v20.8b, v0.8b 601 add v20.8b, v20.8b, v0.8b
600 add v21.8b, v21.8b, v0.8b 602 add v21.8b, v21.8b, v0.8b
601 st1 {v19.8b}, [TMP2] 603 st1 {v19.8b}, [TMP2]
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 604 ldp TMP1, TMP2, [OUTPUT_BUF], 16
603 ldp TMP3, TMP4, [OUTPUT_BUF] 605 ldp TMP3, TMP4, [OUTPUT_BUF]
604 add TMP1, TMP1, OUTPUT_COL 606 add TMP1, TMP1, OUTPUT_COL
605 add TMP2, TMP2, OUTPUT_COL 607 add TMP2, TMP2, OUTPUT_COL
606 add TMP3, TMP3, OUTPUT_COL 608 add TMP3, TMP3, OUTPUT_COL
607 add TMP4, TMP4, OUTPUT_COL 609 add TMP4, TMP4, OUTPUT_COL
608 transpose_single v22, v23, v3, .2d, .8b 610 transpose_single v22, v23, v3, .d, .8b
609 st1 {v20.8b}, [TMP1] 611 st1 {v20.8b}, [TMP1]
610 add v22.8b, v22.8b, v0.8b 612 add v22.8b, v22.8b, v0.8b
611 add v23.8b, v23.8b, v0.8b 613 add v23.8b, v23.8b, v0.8b
612 st1 {v21.8b}, [TMP2] 614 st1 {v21.8b}, [TMP2]
613 st1 {v22.8b}, [TMP3] 615 st1 {v22.8b}, [TMP3]
614 st1 {v23.8b}, [TMP4] 616 st1 {v23.8b}, [TMP4]
615 ldr x15, [sp], 16 617 ldr x15, [sp], 16
616 ld1 {v0.8b - v3.8b}, [sp], 32 618 ld1 {v0.8b - v3.8b}, [sp], 32
617 ld1 {v4.8b - v7.8b}, [sp], 32 619 ld1 {v4.8b - v7.8b}, [sp], 32
618 ld1 {v8.8b - v11.8b}, [sp], 32 620 ld1 {v8.8b - v11.8b}, [sp], 32
(...skipping 13 matching lines...) Expand all
632 transpose ROW4L, ROW5L, v3, .16b, .4h 634 transpose ROW4L, ROW5L, v3, .16b, .4h
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ 635 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
634 transpose ROW1L, ROW3L, v3, .16b, .2s 636 transpose ROW1L, ROW3L, v3, .16b, .2s
635 transpose ROW4L, ROW6L, v3, .16b, .2s 637 transpose ROW4L, ROW6L, v3, .16b, .2s
636 transpose ROW0L, ROW2L, v3, .16b, .2s 638 transpose ROW0L, ROW2L, v3, .16b, .2s
637 transpose ROW5L, ROW7L, v3, .16b, .2s 639 transpose ROW5L, ROW7L, v3, .16b, .2s
638 cmp x0, #0 640 cmp x0, #0
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pa ss */ 641 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pa ss */
640 642
641 /* Only row 0 is non-zero for the right 4x8 half */ 643 /* Only row 0 is non-zero for the right 4x8 half */
642 dup ROW1R.4h, ROW0R.4h[1] 644 dup ROW1R.4h, ROW0R.h[1]
643 dup ROW2R.4h, ROW0R.4h[2] 645 dup ROW2R.4h, ROW0R.h[2]
644 dup ROW3R.4h, ROW0R.4h[3] 646 dup ROW3R.4h, ROW0R.h[3]
645 dup ROW4R.4h, ROW0R.4h[0] 647 dup ROW4R.4h, ROW0R.h[0]
646 dup ROW5R.4h, ROW0R.4h[1] 648 dup ROW5R.4h, ROW0R.h[1]
647 dup ROW6R.4h, ROW0R.4h[2] 649 dup ROW6R.4h, ROW0R.h[2]
648 dup ROW7R.4h, ROW0R.4h[3] 650 dup ROW7R.4h, ROW0R.h[3]
649 dup ROW0R.4h, ROW0R.4h[0] 651 dup ROW0R.4h, ROW0R.h[0]
650 b 1b /* Go to 'normal' second pass */ 652 b 1b /* Go to 'normal' second pass */
651 653
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ 654 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
653 ld1 {v2.4h}, [x15] /* reload constants */ 655 ld1 {v2.4h}, [x15] /* reload constants */
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 656 smull v12.4s, ROW1L.4h, XFIX_1_175875602
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 657 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 658 smull v14.4s, ROW3L.4h, XFIX_1_175875602
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 659 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 660 smull v4.4s, ROW2L.4h, XFIX_0_541196100
659 sshll v6.4s, ROW0L.4h, #13 661 sshll v6.4s, ROW0L.4h, #13
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after
763 * function from jidctfst.c 765 * function from jidctfst.c
764 * 766 *
765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. 767 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
766 * But in ARM NEON case some extra additions are required because VQDMULH 768 * But in ARM NEON case some extra additions are required because VQDMULH
767 * instruction can't handle the constants larger than 1. So the expressions 769 * instruction can't handle the constants larger than 1. So the expressions
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", 770 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
769 * which introduces an extra addition. Overall, there are 6 extra additions 771 * which introduces an extra addition. Overall, there are 6 extra additions
770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. 772 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
771 */ 773 */
772 774
773 #define XFIX_1_082392200 v0.4h[0] 775 #define XFIX_1_082392200 v0.h[0]
774 #define XFIX_1_414213562 v0.4h[1] 776 #define XFIX_1_414213562 v0.h[1]
775 #define XFIX_1_847759065 v0.4h[2] 777 #define XFIX_1_847759065 v0.h[2]
776 #define XFIX_2_613125930 v0.4h[3] 778 #define XFIX_2_613125930 v0.h[3]
777 779
778 .balign 16 780 .balign 16
779 jsimd_idct_ifast_neon_consts: 781 jsimd_idct_ifast_neon_consts:
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ 782 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ 783 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ 784 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ 785 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
784 786
785 asm_function jsimd_idct_ifast_neon 787 asm_function jsimd_idct_ifast_neon
786 788
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 /* Transpose q8-q10 */ 904 /* Transpose q8-q10 */
903 mov v18.16b, v8.16b 905 mov v18.16b, v8.16b
904 trn1 v8.4s, v8.4s, v10.4s 906 trn1 v8.4s, v8.4s, v10.4s
905 trn2 v10.4s, v18.4s, v10.4s 907 trn2 v10.4s, v18.4s, v10.4s
906 /* Transpose q13-q15 */ 908 /* Transpose q13-q15 */
907 mov v18.16b, v13.16b 909 mov v18.16b, v13.16b
908 trn1 v13.4s, v13.4s, v15.4s 910 trn1 v13.4s, v13.4s, v15.4s
909 trn2 v15.4s, v18.4s, v15.4s 911 trn2 v15.4s, v18.4s, v15.4s
910 /* vswp v14.4h, v10-MSB.4h */ 912 /* vswp v14.4h, v10-MSB.4h */
911 umov x22, v14.d[0] 913 umov x22, v14.d[0]
912 ins v14.2d[0], v10.2d[1] 914 ins v14.d[0], v10.d[1]
913 ins v10.2d[1], x22 915 ins v10.d[1], x22
914 /* vswp v13.4h, v9MSB.4h */ 916 /* vswp v13.4h, v9MSB.4h */
915 917
916 umov x22, v13.d[0] 918 umov x22, v13.d[0]
917 ins v13.2d[0], v9.2d[1] 919 ins v13.d[0], v9.d[1]
918 ins v9.2d[1], x22 920 ins v9.d[1], x22
919 /* 1-D IDCT, pass 2 */ 921 /* 1-D IDCT, pass 2 */
920 sub v2.8h, v10.8h, v14.8h 922 sub v2.8h, v10.8h, v14.8h
921 /* vswp v15.4h, v11MSB.4h */ 923 /* vswp v15.4h, v11MSB.4h */
922 umov x22, v15.d[0] 924 umov x22, v15.d[0]
923 ins v15.2d[0], v11.2d[1] 925 ins v15.d[0], v11.d[1]
924 ins v11.2d[1], x22 926 ins v11.d[1], x22
925 add v14.8h, v10.8h, v14.8h 927 add v14.8h, v10.8h, v14.8h
926 /* vswp v12.4h, v8-MSB.4h */ 928 /* vswp v12.4h, v8-MSB.4h */
927 umov x22, v12.d[0] 929 umov x22, v12.d[0]
928 ins v12.2d[0], v8.2d[1] 930 ins v12.d[0], v8.d[1]
929 ins v8.2d[1], x22 931 ins v8.d[1], x22
930 sub v1.8h, v11.8h, v13.8h 932 sub v1.8h, v11.8h, v13.8h
931 add v13.8h, v11.8h, v13.8h 933 add v13.8h, v11.8h, v13.8h
932 sub v5.8h, v9.8h, v15.8h 934 sub v5.8h, v9.8h, v15.8h
933 add v15.8h, v9.8h, v15.8h 935 add v15.8h, v9.8h, v15.8h
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 936 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 937 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
936 add v3.8h, v1.8h, v1.8h 938 add v3.8h, v1.8h, v1.8h
937 sub v1.8h, v5.8h, v1.8h 939 sub v1.8h, v5.8h, v1.8h
938 add v10.8h, v2.8h, v4.8h 940 add v10.8h, v2.8h, v4.8h
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 941 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
990 trn2 v11.8h, v18.8h, v11.8h 992 trn2 v11.8h, v18.8h, v11.8h
991 /* Transpose q8-q10 */ 993 /* Transpose q8-q10 */
992 mov v18.16b, v8.16b 994 mov v18.16b, v8.16b
993 trn1 v8.4s, v8.4s, v10.4s 995 trn1 v8.4s, v8.4s, v10.4s
994 trn2 v10.4s, v18.4s, v10.4s 996 trn2 v10.4s, v18.4s, v10.4s
995 /* Transpose q9-q11 */ 997 /* Transpose q9-q11 */
996 mov v18.16b, v9.16b 998 mov v18.16b, v9.16b
997 trn1 v9.4s, v9.4s, v11.4s 999 trn1 v9.4s, v9.4s, v11.4s
998 trn2 v11.4s, v18.4s, v11.4s 1000 trn2 v11.4s, v18.4s, v11.4s
999 /* make copy */ 1001 /* make copy */
1000 ins v17.2d[0], v8.2d[1] 1002 ins v17.d[0], v8.d[1]
1001 /* Transpose d16-d17-msb */ 1003 /* Transpose d16-d17-msb */
1002 mov v18.16b, v8.16b 1004 mov v18.16b, v8.16b
1003 trn1 v8.8b, v8.8b, v17.8b 1005 trn1 v8.8b, v8.8b, v17.8b
1004 trn2 v17.8b, v18.8b, v17.8b 1006 trn2 v17.8b, v18.8b, v17.8b
1005 /* make copy */ 1007 /* make copy */
1006 ins v19.2d[0], v9.2d[1] 1008 ins v19.d[0], v9.d[1]
1007 mov v18.16b, v9.16b 1009 mov v18.16b, v9.16b
1008 trn1 v9.8b, v9.8b, v19.8b 1010 trn1 v9.8b, v9.8b, v19.8b
1009 trn2 v19.8b, v18.8b, v19.8b 1011 trn2 v19.8b, v18.8b, v19.8b
1010 /* Store results to the output buffer */ 1012 /* Store results to the output buffer */
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 1013 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1012 add TMP1, TMP1, OUTPUT_COL 1014 add TMP1, TMP1, OUTPUT_COL
1013 add TMP2, TMP2, OUTPUT_COL 1015 add TMP2, TMP2, OUTPUT_COL
1014 st1 {v8.8b}, [TMP1] 1016 st1 {v8.8b}, [TMP1]
1015 st1 {v17.8b}, [TMP2] 1017 st1 {v17.8b}, [TMP2]
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 1018 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1017 add TMP1, TMP1, OUTPUT_COL 1019 add TMP1, TMP1, OUTPUT_COL
1018 add TMP2, TMP2, OUTPUT_COL 1020 add TMP2, TMP2, OUTPUT_COL
1019 st1 {v9.8b}, [TMP1] 1021 st1 {v9.8b}, [TMP1]
1020 /* make copy */ 1022 /* make copy */
1021 ins v7.2d[0], v10.2d[1] 1023 ins v7.d[0], v10.d[1]
1022 mov v18.16b, v10.16b 1024 mov v18.16b, v10.16b
1023 trn1 v10.8b, v10.8b, v7.8b 1025 trn1 v10.8b, v10.8b, v7.8b
1024 trn2 v7.8b, v18.8b, v7.8b 1026 trn2 v7.8b, v18.8b, v7.8b
1025 st1 {v19.8b}, [TMP2] 1027 st1 {v19.8b}, [TMP2]
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 1028 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 1029 ldp TMP4, TMP5, [OUTPUT_BUF], 16
1028 add TMP1, TMP1, OUTPUT_COL 1030 add TMP1, TMP1, OUTPUT_COL
1029 add TMP2, TMP2, OUTPUT_COL 1031 add TMP2, TMP2, OUTPUT_COL
1030 add TMP4, TMP4, OUTPUT_COL 1032 add TMP4, TMP4, OUTPUT_COL
1031 add TMP5, TMP5, OUTPUT_COL 1033 add TMP5, TMP5, OUTPUT_COL
1032 st1 {v10.8b}, [TMP1] 1034 st1 {v10.8b}, [TMP1]
1033 /* make copy */ 1035 /* make copy */
1034 ins v16.2d[0], v11.2d[1] 1036 ins v16.d[0], v11.d[1]
1035 mov v18.16b, v11.16b 1037 mov v18.16b, v11.16b
1036 trn1 v11.8b, v11.8b, v16.8b 1038 trn1 v11.8b, v11.8b, v16.8b
1037 trn2 v16.8b, v18.8b, v16.8b 1039 trn2 v16.8b, v18.8b, v16.8b
1038 st1 {v7.8b}, [TMP2] 1040 st1 {v7.8b}, [TMP2]
1039 st1 {v11.8b}, [TMP4] 1041 st1 {v11.8b}, [TMP4]
1040 st1 {v16.8b}, [TMP5] 1042 st1 {v16.8b}, [TMP5]
1041 sub sp, sp, #176 1043 sub sp, sp, #176
1042 ldp x22, x23, [sp], 16 1044 ldp x22, x23, [sp], 16
1043 ld1 {v0.8b - v3.8b}, [sp], 32 1045 ld1 {v0.8b - v3.8b}, [sp], 32
1044 ld1 {v4.8b - v7.8b}, [sp], 32 1046 ld1 {v4.8b - v7.8b}, [sp], 32
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ 1091 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ 1092 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ 1093 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ 1094 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ 1095 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ 1096 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ 1097 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
1096 1098
1097 .balign 16 1099 .balign 16
1098 jsimd_idct_4x4_neon_consts: 1100 jsimd_idct_4x4_neon_consts:
1099 .short FIX_1_847759065 /* v0.4h[0] */ 1101 .short FIX_1_847759065 /* v0.h[0] */
1100 .short -FIX_0_765366865 /* v0.4h[1] */ 1102 .short -FIX_0_765366865 /* v0.h[1] */
1101 .short -FIX_0_211164243 /* v0.4h[2] */ 1103 .short -FIX_0_211164243 /* v0.h[2] */
1102 .short FIX_1_451774981 /* v0.4h[3] */ 1104 .short FIX_1_451774981 /* v0.h[3] */
1103 .short -FIX_2_172734803 /* d1[0] */ 1105 .short -FIX_2_172734803 /* d1[0] */
1104 .short FIX_1_061594337 /* d1[1] */ 1106 .short FIX_1_061594337 /* d1[1] */
1105 .short -FIX_0_509795579 /* d1[2] */ 1107 .short -FIX_0_509795579 /* d1[2] */
1106 .short -FIX_0_601344887 /* d1[3] */ 1108 .short -FIX_0_601344887 /* d1[3] */
1107 .short FIX_0_899976223 /* v2.4h[0] */ 1109 .short FIX_0_899976223 /* v2.h[0] */
1108 .short FIX_2_562915447 /* v2.4h[1] */ 1110 .short FIX_2_562915447 /* v2.h[1] */
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ 1111 .short 1 << (CONST_BITS+1) /* v2.h[2] */
1110 .short 0 /* v2.4h[3] */ 1112 .short 0 /* v2.h[3] */
1111 1113
1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 1114 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
1113 smull v28.4s, \x4, v2.4h[2] 1115 smull v28.4s, \x4, v2.h[2]
1114 smlal v28.4s, \x8, v0.4h[0] 1116 smlal v28.4s, \x8, v0.h[0]
1115 smlal v28.4s, \x14, v0.4h[1] 1117 smlal v28.4s, \x14, v0.h[1]
1116 1118
1117 smull v26.4s, \x16, v1.4h[2] 1119 smull v26.4s, \x16, v1.h[2]
1118 smlal v26.4s, \x12, v1.4h[3] 1120 smlal v26.4s, \x12, v1.h[3]
1119 smlal v26.4s, \x10, v2.4h[0] 1121 smlal v26.4s, \x10, v2.h[0]
1120 smlal v26.4s, \x6, v2.4h[1] 1122 smlal v26.4s, \x6, v2.h[1]
1121 1123
1122 smull v30.4s, \x4, v2.4h[2] 1124 smull v30.4s, \x4, v2.h[2]
1123 smlsl v30.4s, \x8, v0.4h[0] 1125 smlsl v30.4s, \x8, v0.h[0]
1124 smlsl v30.4s, \x14, v0.4h[1] 1126 smlsl v30.4s, \x14, v0.h[1]
1125 1127
1126 smull v24.4s, \x16, v0.4h[2] 1128 smull v24.4s, \x16, v0.h[2]
1127 smlal v24.4s, \x12, v0.4h[3] 1129 smlal v24.4s, \x12, v0.h[3]
1128 smlal v24.4s, \x10, v1.4h[0] 1130 smlal v24.4s, \x10, v1.h[0]
1129 smlal v24.4s, \x6, v1.4h[1] 1131 smlal v24.4s, \x6, v1.h[1]
1130 1132
1131 add v20.4s, v28.4s, v26.4s 1133 add v20.4s, v28.4s, v26.4s
1132 sub v28.4s, v28.4s, v26.4s 1134 sub v28.4s, v28.4s, v26.4s
1133 1135
1134 .if \shift > 16 1136 .if \shift > 16
1135 srshr v20.4s, v20.4s, #\shift 1137 srshr v20.4s, v20.4s, #\shift
1136 srshr v28.4s, v28.4s, #\shift 1138 srshr v28.4s, v28.4s, #\shift
1137 xtn \y26, v20.4s 1139 xtn \y26, v20.4s
1138 xtn \y29, v28.4s 1140 xtn \y29, v28.4s
1139 .else 1141 .else
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
1196 */ 1198 */
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 1199 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 1200 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1199 add COEF_BLOCK, COEF_BLOCK, #16 1201 add COEF_BLOCK, COEF_BLOCK, #16
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 1202 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 1203 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1202 /* dequantize */ 1204 /* dequantize */
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 1205 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1204 mul v4.4h, v4.4h, v18.4h 1206 mul v4.4h, v4.4h, v18.4h
1205 mul v5.4h, v5.4h, v19.4h 1207 mul v5.4h, v5.4h, v19.4h
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ 1208 ins v4.d[1], v5.d[0] /* 128 bit q4 */
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 1209 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1208 mul v6.4h, v6.4h, v20.4h 1210 mul v6.4h, v6.4h, v20.4h
1209 mul v7.4h, v7.4h, v21.4h 1211 mul v7.4h, v7.4h, v21.4h
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ 1212 ins v6.d[1], v7.d[0] /* 128 bit q6 */
1211 mul v8.4h, v8.4h, v22.4h 1213 mul v8.4h, v8.4h, v22.4h
1212 mul v9.4h, v9.4h, v23.4h 1214 mul v9.4h, v9.4h, v23.4h
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ 1215 ins v8.d[1], v9.d[0] /* 128 bit q8 */
1214 add DCT_TABLE, DCT_TABLE, #16 1216 add DCT_TABLE, DCT_TABLE, #16
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 1217 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1216 mul v10.4h, v10.4h, v24.4h 1218 mul v10.4h, v10.4h, v24.4h
1217 mul v11.4h, v11.4h, v25.4h 1219 mul v11.4h, v11.4h, v25.4h
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ 1220 ins v10.d[1], v11.d[0] /* 128 bit q10 */
1219 mul v12.4h, v12.4h, v26.4h 1221 mul v12.4h, v12.4h, v26.4h
1220 mul v13.4h, v13.4h, v27.4h 1222 mul v13.4h, v13.4h, v27.4h
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ 1223 ins v12.d[1], v13.d[0] /* 128 bit q12 */
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 1224 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1223 mul v14.4h, v14.4h, v28.4h 1225 mul v14.4h, v14.4h, v28.4h
1224 mul v15.4h, v15.4h, v29.4h 1226 mul v15.4h, v15.4h, v29.4h
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ 1227 ins v14.d[1], v15.d[0] /* 128 bit q14 */
1226 mul v16.4h, v16.4h, v30.4h 1228 mul v16.4h, v16.4h, v30.4h
1227 mul v17.4h, v17.4h, v31.4h 1229 mul v17.4h, v17.4h, v31.4h
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ 1230 ins v16.d[1], v17.d[0] /* 128 bit q16 */
1229 1231
1230 /* Pass 1 */ 1232 /* Pass 1 */
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4. 4h, v6.4h, v8.4h, v10.4h 1233 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4. 4h, v6.4h, v8.4h, v10.4h
1232 transpose_4x4 v4, v6, v8, v10, v3 1234 transpose_4x4 v4, v6, v8, v10, v3
1233 ins v10.2d[1], v11.2d[0] 1235 ins v10.d[1], v11.d[0]
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5. 4h, v7.4h, v9.4h, v11.4h 1236 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5. 4h, v7.4h, v9.4h, v11.4h
1235 transpose_4x4 v5, v7, v9, v11, v3 1237 transpose_4x4 v5, v7, v9, v11, v3
1236 ins v10.2d[1], v11.2d[0] 1238 ins v10.d[1], v11.d[0]
1237 /* Pass 2 */ 1239 /* Pass 2 */
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4 h, v27.4h, v28.4h, v29.4h 1240 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4 h, v27.4h, v28.4h, v29.4h
1239 transpose_4x4 v26, v27, v28, v29, v3 1241 transpose_4x4 v26, v27, v28, v29, v3
1240 1242
1241 /* Range limit */ 1243 /* Range limit */
1242 movi v30.8h, #0x80 1244 movi v30.8h, #0x80
1243 ins v26.2d[1], v27.2d[0] 1245 ins v26.d[1], v27.d[0]
1244 ins v28.2d[1], v29.2d[0] 1246 ins v28.d[1], v29.d[0]
1245 add v26.8h, v26.8h, v30.8h 1247 add v26.8h, v26.8h, v30.8h
1246 add v28.8h, v28.8h, v30.8h 1248 add v28.8h, v28.8h, v30.8h
1247 sqxtun v26.8b, v26.8h 1249 sqxtun v26.8b, v26.8h
1248 sqxtun v27.8b, v28.8h 1250 sqxtun v27.8b, v28.8h
1249 1251
1250 /* Store results to the output buffer */ 1252 /* Store results to the output buffer */
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 1253 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1252 ldp TMP3, TMP4, [OUTPUT_BUF] 1254 ldp TMP3, TMP4, [OUTPUT_BUF]
1253 add TMP1, TMP1, OUTPUT_COL 1255 add TMP1, TMP1, OUTPUT_COL
1254 add TMP2, TMP2, OUTPUT_COL 1256 add TMP2, TMP2, OUTPUT_COL
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
1326 1328
1327 .balign 8 1329 .balign 8
1328 jsimd_idct_2x2_neon_consts: 1330 jsimd_idct_2x2_neon_consts:
1329 .short -FIX_0_720959822 /* v14[0] */ 1331 .short -FIX_0_720959822 /* v14[0] */
1330 .short FIX_0_850430095 /* v14[1] */ 1332 .short FIX_0_850430095 /* v14[1] */
1331 .short -FIX_1_272758580 /* v14[2] */ 1333 .short -FIX_1_272758580 /* v14[2] */
1332 .short FIX_3_624509785 /* v14[3] */ 1334 .short FIX_3_624509785 /* v14[3] */
1333 1335
1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 1336 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1335 sshll v15.4s, \x4, #15 1337 sshll v15.4s, \x4, #15
1336 smull v26.4s, \x6, v14.4h[3] 1338 smull v26.4s, \x6, v14.h[3]
1337 smlal v26.4s, \x10, v14.4h[2] 1339 smlal v26.4s, \x10, v14.h[2]
1338 smlal v26.4s, \x12, v14.4h[1] 1340 smlal v26.4s, \x12, v14.h[1]
1339 smlal v26.4s, \x16, v14.4h[0] 1341 smlal v26.4s, \x16, v14.h[0]
1340 1342
1341 add v20.4s, v15.4s, v26.4s 1343 add v20.4s, v15.4s, v26.4s
1342 sub v15.4s, v15.4s, v26.4s 1344 sub v15.4s, v15.4s, v26.4s
1343 1345
1344 .if \shift > 16 1346 .if \shift > 16
1345 srshr v20.4s, v20.4s, #\shift 1347 srshr v20.4s, v20.4s, #\shift
1346 srshr v15.4s, v15.4s, #\shift 1348 srshr v15.4s, v15.4s, #\shift
1347 xtn \y26, v20.4s 1349 xtn \y26, v20.4s
1348 xtn \y27, v15.4s 1350 xtn \y27, v15.4s
1349 .else 1351 .else
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
1393 add COEF_BLOCK, COEF_BLOCK, #16 1395 add COEF_BLOCK, COEF_BLOCK, #16
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 1396 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
1395 add COEF_BLOCK, COEF_BLOCK, #16 1397 add COEF_BLOCK, COEF_BLOCK, #16
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 1398 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
1397 add COEF_BLOCK, COEF_BLOCK, #16 1399 add COEF_BLOCK, COEF_BLOCK, #16
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 1400 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1399 /* Dequantize */ 1401 /* Dequantize */
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 1402 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1401 mul v4.4h, v4.4h, v18.4h 1403 mul v4.4h, v4.4h, v18.4h
1402 mul v5.4h, v5.4h, v19.4h 1404 mul v5.4h, v5.4h, v19.4h
1403 ins v4.2d[1], v5.2d[0] 1405 ins v4.d[1], v5.d[0]
1404 mul v6.4h, v6.4h, v20.4h 1406 mul v6.4h, v6.4h, v20.4h
1405 mul v7.4h, v7.4h, v21.4h 1407 mul v7.4h, v7.4h, v21.4h
1406 ins v6.2d[1], v7.2d[0] 1408 ins v6.d[1], v7.d[0]
1407 add DCT_TABLE, DCT_TABLE, #16 1409 add DCT_TABLE, DCT_TABLE, #16
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 1410 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
1409 mul v10.4h, v10.4h, v24.4h 1411 mul v10.4h, v10.4h, v24.4h
1410 mul v11.4h, v11.4h, v25.4h 1412 mul v11.4h, v11.4h, v25.4h
1411 ins v10.2d[1], v11.2d[0] 1413 ins v10.d[1], v11.d[0]
1412 add DCT_TABLE, DCT_TABLE, #16 1414 add DCT_TABLE, DCT_TABLE, #16
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 1415 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
1414 mul v12.4h, v12.4h, v26.4h 1416 mul v12.4h, v12.4h, v26.4h
1415 mul v13.4h, v13.4h, v27.4h 1417 mul v13.4h, v13.4h, v27.4h
1416 ins v12.2d[1], v13.2d[0] 1418 ins v12.d[1], v13.d[0]
1417 add DCT_TABLE, DCT_TABLE, #16 1419 add DCT_TABLE, DCT_TABLE, #16
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 1420 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1419 mul v16.4h, v16.4h, v30.4h 1421 mul v16.4h, v16.4h, v30.4h
1420 mul v17.4h, v17.4h, v31.4h 1422 mul v17.4h, v17.4h, v31.4h
1421 ins v16.2d[1], v17.2d[0] 1423 ins v16.d[1], v17.d[0]
1422 1424
1423 /* Pass 1 */ 1425 /* Pass 1 */
1424 #if 0 1426 #if 0
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h 1427 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h 1428 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h 1429 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h 1430 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
1429 #else 1431 #else
1430 smull v26.4s, v6.4h, v14.4h[3] 1432 smull v26.4s, v6.4h, v14.h[3]
1431 smlal v26.4s, v10.4h, v14.4h[2] 1433 smlal v26.4s, v10.4h, v14.h[2]
1432 smlal v26.4s, v12.4h, v14.4h[1] 1434 smlal v26.4s, v12.4h, v14.h[1]
1433 smlal v26.4s, v16.4h, v14.4h[0] 1435 smlal v26.4s, v16.4h, v14.h[0]
1434 smull v24.4s, v7.4h, v14.4h[3] 1436 smull v24.4s, v7.4h, v14.h[3]
1435 smlal v24.4s, v11.4h, v14.4h[2] 1437 smlal v24.4s, v11.4h, v14.h[2]
1436 smlal v24.4s, v13.4h, v14.4h[1] 1438 smlal v24.4s, v13.4h, v14.h[1]
1437 smlal v24.4s, v17.4h, v14.4h[0] 1439 smlal v24.4s, v17.4h, v14.h[0]
1438 sshll v15.4s, v4.4h, #15 1440 sshll v15.4s, v4.4h, #15
1439 sshll v30.4s, v5.4h, #15 1441 sshll v30.4s, v5.4h, #15
1440 add v20.4s, v15.4s, v26.4s 1442 add v20.4s, v15.4s, v26.4s
1441 sub v15.4s, v15.4s, v26.4s 1443 sub v15.4s, v15.4s, v26.4s
1442 rshrn v4.4h, v20.4s, #13 1444 rshrn v4.4h, v20.4s, #13
1443 rshrn v6.4h, v15.4s, #13 1445 rshrn v6.4h, v15.4s, #13
1444 add v20.4s, v30.4s, v24.4s 1446 add v20.4s, v30.4s, v24.4s
1445 sub v15.4s, v30.4s, v24.4s 1447 sub v15.4s, v30.4s, v24.4s
1446 rshrn v5.4h, v20.4s, #13 1448 rshrn v5.4h, v20.4s, #13
1447 rshrn v7.4h, v15.4s, #13 1449 rshrn v7.4h, v15.4s, #13
1448 ins v4.2d[1], v5.2d[0] 1450 ins v4.d[1], v5.d[0]
1449 ins v6.2d[1], v7.2d[0] 1451 ins v6.d[1], v7.d[0]
1450 transpose v4, v6, v3, .16b, .8h 1452 transpose v4, v6, v3, .16b, .8h
1451 transpose v6, v10, v3, .16b, .4s 1453 transpose v6, v10, v3, .16b, .4s
1452 ins v11.2d[0], v10.2d[1] 1454 ins v11.d[0], v10.d[1]
1453 ins v7.2d[0], v6.2d[1] 1455 ins v7.d[0], v6.d[1]
1454 #endif 1456 #endif
1455 1457
1456 /* Pass 2 */ 1458 /* Pass 2 */
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h 1459 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1458 1460
1459 /* Range limit */ 1461 /* Range limit */
1460 movi v30.8h, #0x80 1462 movi v30.8h, #0x80
1461 ins v26.2d[1], v27.2d[0] 1463 ins v26.d[1], v27.d[0]
1462 add v26.8h, v26.8h, v30.8h 1464 add v26.8h, v26.8h, v30.8h
1463 sqxtun v30.8b, v26.8h 1465 sqxtun v30.8b, v26.8h
1464 ins v26.2d[0], v30.2d[0] 1466 ins v26.d[0], v30.d[0]
1465 sqxtun v27.8b, v26.8h 1467 sqxtun v27.8b, v26.8h
1466 1468
1467 /* Store results to the output buffer */ 1469 /* Store results to the output buffer */
1468 ldp TMP1, TMP2, [OUTPUT_BUF] 1470 ldp TMP1, TMP2, [OUTPUT_BUF]
1469 add TMP1, TMP1, OUTPUT_COL 1471 add TMP1, TMP1, OUTPUT_COL
1470 add TMP2, TMP2, OUTPUT_COL 1472 add TMP2, TMP2, OUTPUT_COL
1471 1473
1472 st1 {v26.b}[0], [TMP1], 1 1474 st1 {v26.b}[0], [TMP1], 1
1473 st1 {v27.b}[4], [TMP1], 1 1475 st1 {v27.b}[4], [TMP1], 1
1474 st1 {v26.b}[1], [TMP2], 1 1476 st1 {v26.b}[1], [TMP2], 1
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
1597 .endif 1599 .endif
1598 .endm 1600 .endm
1599 1601
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize 1602 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
1601 1603
1602 /* 1604 /*
1603 * 2-stage pipelined YCbCr->RGB conversion 1605 * 2-stage pipelined YCbCr->RGB conversion
1604 */ 1606 */
1605 1607
1606 .macro do_yuv_to_rgb_stage1 1608 .macro do_yuv_to_rgb_stage1
1607 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ 1609 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
1608 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1610 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1611 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1612 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1613 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1614 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1615 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1616 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ 1617 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ 1618 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1617 .endm 1619 .endm
1618 1620
1619 .macro do_yuv_to_rgb_stage2 1621 .macro do_yuv_to_rgb_stage2
1620 rshrn v20.4h, v20.4s, #15 1622 rshrn v20.4h, v20.4s, #15
1621 rshrn2 v20.8h, v22.4s, #15 1623 rshrn2 v20.8h, v22.4s, #15
1622 rshrn v24.4h, v24.4s, #14 1624 rshrn v24.4h, v24.4s, #14
1623 rshrn2 v24.8h, v26.4s, #14 1625 rshrn2 v24.8h, v26.4s, #14
1624 rshrn v28.4h, v28.4s, #14 1626 rshrn v28.4h, v28.4s, #14
1625 rshrn2 v28.8h, v30.4s, #14 1627 rshrn2 v28.8h, v30.4s, #14
1626 uaddw v20.8h, v20.8h, v0.8b 1628 uaddw v20.8h, v20.8h, v0.8b
(...skipping 26 matching lines...) Expand all
1653 uaddw v24.8h, v24.8h, v0.8b 1655 uaddw v24.8h, v24.8h, v0.8b
1654 uaddw v28.8h, v28.8h, v0.8b 1656 uaddw v28.8h, v28.8h, v0.8b
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ 1657 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/
1656 sqxtun v1\g_offs\defsize, v20.8h 1658 sqxtun v1\g_offs\defsize, v20.8h
1657 ld1 {v0.8b}, [Y], 8 1659 ld1 {v0.8b}, [Y], 8
1658 sqxtun v1\r_offs\defsize, v24.8h 1660 sqxtun v1\r_offs\defsize, v24.8h
1659 prfm PLDL1KEEP, [U, #64] 1661 prfm PLDL1KEEP, [U, #64]
1660 prfm PLDL1KEEP, [V, #64] 1662 prfm PLDL1KEEP, [V, #64]
1661 prfm PLDL1KEEP, [Y, #64] 1663 prfm PLDL1KEEP, [Y, #64]
1662 sqxtun v1\b_offs\defsize, v28.8h 1664 sqxtun v1\b_offs\defsize, v28.8h
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1665 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1664 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1666 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1667 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1668 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1669 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1670 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1671 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1672 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1671 .else /**************************** rgb565 ***********************************/ 1673 .else /**************************** rgb565 ***********************************/
1672 sqshlu v21.8h, v20.8h, #8 1674 sqshlu v21.8h, v20.8h, #8
1673 sqshlu v25.8h, v24.8h, #8 1675 sqshlu v25.8h, v24.8h, #8
1674 sqshlu v29.8h, v28.8h, #8 1676 sqshlu v29.8h, v28.8h, #8
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1677 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1676 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1678 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1677 ld1 {v0.8b}, [Y], 8 1679 ld1 {v0.8b}, [Y], 8
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1680 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1681 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1682 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1683 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1682 sri v25.8h, v21.8h, #5 1684 sri v25.8h, v21.8h, #5
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1685 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1686 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1685 prfm PLDL1KEEP, [U, #64] 1687 prfm PLDL1KEEP, [U, #64]
1686 prfm PLDL1KEEP, [V, #64] 1688 prfm PLDL1KEEP, [V, #64]
1687 prfm PLDL1KEEP, [Y, #64] 1689 prfm PLDL1KEEP, [Y, #64]
1688 sri v25.8h, v29.8h, #11 1690 sri v25.8h, v29.8h, #11
1689 .endif 1691 .endif
1690 do_store \bpp, 8 1692 do_store \bpp, 8
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ 1693 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ 1694 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1693 .endm 1695 .endm
1694 1696
1695 .macro do_yuv_to_rgb 1697 .macro do_yuv_to_rgb
1696 do_yuv_to_rgb_stage1 1698 do_yuv_to_rgb_stage1
1697 do_yuv_to_rgb_stage2 1699 do_yuv_to_rgb_stage2
1698 .endm 1700 .endm
1699 1701
1700 /* Apple gas crashes on adrl, work around that by using adr. 1702 /* Apple gas crashes on adrl, work around that by using adr.
1701 * But this requires a copy of these constants for each function. 1703 * But this requires a copy of these constants for each function.
1702 */ 1704 */
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ 1854 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b 1855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b 1856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b 1857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b 1858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b 1859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b 1860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b 1861 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b
1860 .purgem do_load 1862 .purgem do_load
1861 .purgem do_store 1863 .purgem do_store
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698