Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(124)

Side by Side Diff: simd/jsimd_arm_neon_64.S

Issue 434123003: Add ARM64 SIMD support to libjpeg_turbo (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« libjpeg.gyp ('K') | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 *
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the authors be held liable for any damages
12 * arising from the use of this software.
13 *
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute it
16 * freely, subject to the following restrictions:
17 *
18 * 1. The origin of this software must not be misrepresented; you must not
19 * claim that you wrote the original software. If you use this software
20 * in a product, an acknowledgment in the product documentation would be
21 * appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution.
25 */
26
27 #if defined(__linux__) && defined(__ELF__)
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
29 #endif
30
31 .text
32 .arch armv8-a+fp+simd
33
34
35 #define RESPECT_STRICT_ALIGNMENT 1
36
37
38
39 /*****************************************************************************/
40
/* Supplementary macro for setting function attributes.
 * Emits the platform-appropriate symbol directives and entry label for a
 * global function named \fname:
 *   - Mach-O (__APPLE__): the symbol gets the leading underscore the ABI
 *     requires; only .globl is emitted (no ELF-specific directives).
 *   - Everything else: plain symbol name; on ELF the symbol is additionally
 *     marked .hidden (not exported from a shared object) and typed as
 *     %function so tools see it as code.
 * .func pairs with the .endfunc at the end of each routine (debug info).
 */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
57
/* Transpose the two elements of a single 128-bit register pair:
 * copies lane 0 of \a into scratch \tmp, moves lane 1 of \a into \b,
 * then trn1/trn2 interleave so \a and \b hold the transposed pair.
 * \tmplen selects the element size for the lane moves, \literal the
 * arrangement for the trn instructions. */
.macro transpose_single a, b, tmp, tmplen, literal
    ins  \tmp\tmplen[0], \a\tmplen[0]
    ins  \b\tmplen[0], \a\tmplen[1]
    trn1 \a\literal, \a\literal, \b\literal
    trn2 \b\literal, \tmp\literal, \b\literal
.endm
65
/* Transpose corresponding elements of two different registers:
 * keeps a copy of \a in scratch \tmp, then interleaves \a/\b with
 * trn1/trn2 using the \literal arrangement. */
.macro transpose a, b, tmp, tmplen, literal
    mov  \tmp\tmplen, \a\tmplen
    trn1 \a\literal, \a\literal, \b\literal
    trn2 \b\literal, \tmp\literal, \b\literal
.endm
72
/* Transpose a block of 4x4 coefficients in four 64-bit registers,
 * at 32-bit element granularity: interleaves \x0 with \x2 and \x1 with
 * \x3, using \xi as scratch.
 * NOTE(review): the trn2 operands reuse \x0len/\x1len for the scratch
 * copy instead of \x2len/\x3len; harmless as long as every call site
 * passes the same length suffix for all four registers (all current
 * callers do) — confirm before adding callers with mixed suffixes. */
.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm
82
/* Transpose a block of 4x4 coefficients in four 64-bit registers,
 * at 16-bit element granularity: interleaves \x0 with \x1 and \x2 with
 * \x3, using \xi as scratch.
 * NOTE(review): several operands mix the length suffixes of different
 * parameters (\x1\x2len, \xi\x0len, \xi\x1len); this is only correct
 * because all current callers pass an identical suffix for every
 * register — confirm before reusing with heterogeneous suffixes. */
.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm
91
/* Fully transpose a 4x4 block of 16-bit elements held in four 64-bit
 * registers: first interleave at 16-bit granularity, then at 32-bit
 * granularity.  \x5 is a scratch register and is clobbered. */
.macro transpose_4x4 x0, x1, x2, x3,x5
    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
.endm
96
97
98 #define CENTERJSAMPLE 128
99
100 /*****************************************************************************/
101
102 /*
103 * Perform dequantization and inverse DCT on one block of coefficients.
104 *
105 * GLOBAL(void)
106 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
107 * JSAMPARRAY output_buf, JDIMENSION output_col)
108 */
109
/* Fixed-point multiplier constants for the slow-but-accurate IDCT,
 * scaled by 2^13 (CONST_BITS = 13): each value is round(x * 8192),
 * e.g. 0.298631336 * 8192 ~= 2446. */
#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

/* Pre-combined differences/sums so each product needs only one
 * multiplier and fits a 16-bit NEON multiply operand. */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
131
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'.
 *
 * Apparently never expanded in this file: it serves as documentation of
 * the exact computation the hand-scheduled NEON code below performs
 * (register q1..q7 roughly correspond to the v-register usage).  The
 * tmp0..tmp13 outputs are assumed to be declared by the surrounding
 * scope when the macro is used.
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0  = q4; \
    tmp1  = q5; \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3  = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}
202
/* Lane aliases used as by-element multiply operands (smull/smlal/smlsl).
 * The lane order here MUST match the order of the .short entries in
 * jsimd_idct_islow_neon_consts below: v0/v1 hold the first eight
 * constants, v2 the four "reloadable" ones that are re-fetched with
 * ld1 {v2.4h}, [x15] at the start of each pass. */
#define XFIX_0_899976223                    v0.4h[0]
#define XFIX_0_541196100                    v0.4h[1]
#define XFIX_2_562915447                    v0.4h[2]
#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
#define XFIX_1_175875602                    v1.4h[3]
#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]

/* Constant pool; loaded via adr, so it must stay within +/-1 MB of the
 * function that references it. */
.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
231
/*
 * Dequantize and perform the accurate (islow) inverse DCT on one 8x8
 * block of coefficients; see the C prototype in the comment above.
 *
 * AAPCS64 in:  x0 = dct_table, x1 = coef_block,
 *              x2 = output_buf, x3 = output_col
 * Scratch:     x0-x5 (x4/x5 saved and restored around pass 1),
 *              x15 = constant-pool pointer (saved/restored),
 *              v0-v31 (all saved on entry, restored on exit).
 * The block is processed as two 4x8 halves (ROWnL = left, ROWnR = right);
 * pass 1 accumulates an OR of the right-half coefficients in x0 so a
 * mostly-zero right half can take the sparse path at label 3/4.
 *
 * Review fixes applied relative to the posted patch:
 *  - pass 1, right half: the final butterfly wrote "sub v12, v2, v8",
 *    clobbering v12 (= v4 + v14, needed for ROW0R) and leaving stale
 *    data in v6 (read for ROW4R).  Every parallel sequence in this file
 *    writes v6 here; changed to "sub v6, v2, v8".
 *  - epilogue: "blr x30" returned via a branch-and-link, corrupting the
 *    return-address predictor; replaced with "ret".
 */
asm_function jsimd_idct_islow_neon

    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x2
    TMP4        .req x15

    ROW0L       .req v16
    ROW0R       .req v17
    ROW1L       .req v18
    ROW1R       .req v19
    ROW2L       .req v20
    ROW2R       .req v21
    ROW3L       .req v22
    ROW3R       .req v23
    ROW4L       .req v24
    ROW4R       .req v25
    ROW5L       .req v26
    ROW5R       .req v27
    ROW6L       .req v28
    ROW6R       .req v29
    ROW7L       .req v30
    ROW7R       .req v31

    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
    sub             sp, sp, 272
    str             x15, [sp], 16
    adr             x15, jsimd_idct_islow_neon_consts
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v20.8b - v23.8b}, [sp], 32
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v28.8b - v31.8b}, [sp], 32
    /* Load coefficients and dequantize (multiply by the quant table),
     * packing each pair of 4h rows into one 128-bit register. */
    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
    mul             v16.4h, v16.4h, v0.4h
    mul             v17.4h, v17.4h, v1.4h
    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul             v18.4h, v18.4h, v2.4h
    mul             v19.4h, v19.4h, v3.4h
    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
    mul             v20.4h, v20.4h, v4.4h
    mul             v21.4h, v21.4h, v5.4h
    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    mul             v22.4h, v22.4h, v6.4h
    mul             v23.4h, v23.4h, v7.4h
    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
    mul             v24.4h, v24.4h, v0.4h
    mul             v25.4h, v25.4h, v1.4h
    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul             v28.4h, v28.4h, v4.4h
    mul             v29.4h, v29.4h, v5.4h
    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
    mul             v26.4h, v26.4h, v2.4h
    mul             v27.4h, v27.4h, v3.4h
    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
    add             x15, x15, #16
    mul             v30.4h, v30.4h, v6.4h
    mul             v31.4h, v31.4h, v7.4h
    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
    /* Go to the bottom of the stack */
    sub             sp, sp, 352
    stp             x4, x5, [sp], 16
    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
    st1             {v12.4h - v15.4h}, [sp], 32
    /* 1-D IDCT, pass 1, left 4x8 half */
    add             v4.4h, ROW7L.4h, ROW3L.4h
    add             v5.4h, ROW5L.4h, ROW1L.4h
    smull           v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, v5.4h, XFIX_1_175875602
    smull           v14.4s, v4.4h, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    smlal           v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl           v6.4s, ROW0L.4h, ROW4L.4h
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal           v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
    orr             x0, x4, x5
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5L.4h, XFIX_2_562915447
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl             v6.4s, v6.4s, #13
    orr             x0, x0, x4
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    orr             x0, x0, x5
    add             v2.4s, v6.4s, v4.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    orr             x0, x0, x4
    smlsl           v14.4s, ROW7L.4h, XFIX_0_899976223
    orr             x0, x0, x5
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn           ROW1L.4h, v2.4s, #11
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
    orr             x0, x0, x4
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    orr             x0, x0, x5
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    smlal           v12.4s, ROW6L.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    orr             x0, x0, x4
    rshrn           ROW6L.4h, v2.4s, #11
    orr             x0, x0, x5
    add             v2.4s, v6.4s, v10.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0L.4h, ROW4L.4h
    orr             x0, x0, x4
    rshrn           ROW2L.4h, v2.4s, #11
    orr             x0, x0, x5
    rshrn           ROW5L.4h, v6.4s, #11
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
    orr             x0, x0, x4
    add             v4.4s, v10.4s, v12.4s
    orr             x0, x0, x5
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    orr             x0, x4, x5
    sub             v6.4s, v2.4s, v8.4s
    /* pop {x4, x5} */
    sub             sp, sp, 80
    ldp             x4, x5, [sp], 16
    rshrn           ROW7L.4h, v4.4s, #11
    rshrn           ROW3L.4h, v10.4s, #11
    rshrn           ROW0L.4h, v12.4s, #11
    rshrn           ROW4L.4h, v6.4s, #11
    cmp             x0, #0  /* orrs instruction removed */

    beq             3f  /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    add             v10.4h, ROW7R.4h, ROW3R.4h
    add             v8.4h, ROW5R.4h, ROW1R.4h
    /* Transpose ROW6L <-> ROW7L (v3 available free register) */
    transpose       ROW6L, ROW7L, v3, .16b, .4h
    smull           v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, v8.4h, XFIX_1_175875602
    /* Transpose ROW2L <-> ROW3L (v3 available free register) */
    transpose       ROW2L, ROW3L, v3, .16b, .4h
    smull           v14.4s, v10.4h, XFIX_1_175875602
    smlal           v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
    /* Transpose ROW0L <-> ROW1L (v3 available free register) */
    transpose       ROW0L, ROW1L, v3, .16b, .4h
    ssubl           v6.4s, ROW0R.4h, ROW4R.4h
    smull           v4.4s, ROW2R.4h, XFIX_0_541196100
    smlal           v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    /* Transpose ROW4L <-> ROW5L (v3 available free register) */
    transpose       ROW4L, ROW5L, v3, .16b, .4h
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal           v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
    /* Transpose ROW1L <-> ROW3L (v3 available free register) */
    transpose       ROW1L, ROW3L, v3, .16b, .2s
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW1R.4h, XFIX_0_899976223
    /* Transpose ROW4L <-> ROW6L (v3 available free register) */
    transpose       ROW4L, ROW6L, v3, .16b, .2s
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    /* Transpose ROW0L <-> ROW2L (v3 available free register) */
    transpose       ROW0L, ROW2L, v3, .16b, .2s
    smlsl           v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal           v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn           ROW1R.4h, v2.4s, #11
    /* Transpose ROW5L <-> ROW7L (v3 available free register) */
    transpose       ROW5L, ROW7L, v3, .16b, .2s
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl           v10.4s, ROW3R.4h, XFIX_2_562915447
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal           v12.4s, ROW6R.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    rshrn           ROW6R.4h, v2.4s, #11
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0R.4h, ROW4R.4h
    rshrn           ROW2R.4h, v2.4s, #11
    rshrn           ROW5R.4h, v6.4s, #11
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    /* FIX(review): was "sub v12.4s, ..." which clobbered v12 (needed for
     * ROW0R below) and left v6 stale for ROW4R; all parallel butterflies
     * in this function write v6 here. */
    sub             v6.4s, v2.4s, v8.4s
    rshrn           ROW7R.4h, v4.4s, #11
    rshrn           ROW3R.4h, v10.4s, #11
    rshrn           ROW0R.4h, v12.4s, #11
    rshrn           ROW4R.4h, v6.4s, #11
    /* Transpose right 4x8 half */
    transpose       ROW6R, ROW7R, v3, .16b, .4h
    transpose       ROW2R, ROW3R, v3, .16b, .4h
    transpose       ROW0R, ROW1R, v3, .16b, .4h
    transpose       ROW4R, ROW5R, v3, .16b, .4h
    transpose       ROW1R, ROW3R, v3, .16b, .2s
    transpose       ROW4R, ROW6R, v3, .16b, .2s
    transpose       ROW0R, ROW2R, v3, .16b, .2s
    transpose       ROW5R, ROW7R, v3, .16b, .2s

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW1R.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW3R.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl           v6.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal           v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L.4h <-> ROW2R.4h */
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW1R.4h, XFIX_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    smlsl           v14.4s, ROW3R.4h, XFIX_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    shrn            ROW1L.4h, v2.4s, #16
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal           v12.4s, ROW2R.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW2L.4h, v2.4s, #16
    shrn            ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW3L.4h, v10.4s, #16
    shrn            ROW0L.4h, v12.4s, #16
    shrn            ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2, right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW5R.4h, XFIX_1_175875602
    smlal           v12.4s, ROW5L.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smull           v14.4s, ROW7R.4h, XFIX_1_175875602
    smlal           v14.4s, ROW7L.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
    smlal           v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    ssubl           v6.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull           v4.4s, ROW6L.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    smlal           v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal           v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW5L.4h, XFIX_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    smlsl           v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal           v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    shrn            ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl           v10.4s, ROW7L.4h, XFIX_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L.4h <-> ROW2R.4h */
    smlal           v12.4s, ROW6R.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW6R.4h, v2.4s, #16
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn            ROW5R.4h, v6.4s, #16
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW7R.4h, v4.4s, #16
    shrn            ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW4R.4h, v6.4s, #16

2:  /* Descale to 8-bit and range limit */
    ins             v16.2d[1], v17.2d[0]
    ins             v18.2d[1], v19.2d[0]
    ins             v20.2d[1], v21.2d[0]
    ins             v22.2d[1], v23.2d[0]
    sqrshrn         v16.8b, v16.8h, #2
    sqrshrn2        v16.16b, v18.8h, #2
    sqrshrn         v18.8b, v20.8h, #2
    sqrshrn2        v18.16b, v22.8h, #2

    /* vpop {v8.4h - v15.4h} */  /* restore NEON registers */
    ld1             {v8.4h - v11.4h}, [sp], 32
    ld1             {v12.4h - v15.4h}, [sp], 32
    ins             v24.2d[1], v25.2d[0]

    sqrshrn         v20.8b, v24.8h, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    /* trn1 v16.8h, v16.8h, v18.8h */
    transpose       v16, v18, v3, .16b, .8h
    ins             v26.2d[1], v27.2d[0]
    ins             v28.2d[1], v29.2d[0]
    ins             v30.2d[1], v31.2d[0]
    sqrshrn2        v20.16b, v26.8h, #2
    sqrshrn         v22.8b, v28.8h, #2
    movi            v0.16b, #(CENTERJSAMPLE)
    sqrshrn2        v22.16b, v30.8h, #2
    transpose_single v16, v17, v3, .2d, .8b
    transpose_single v18, v19, v3, .2d, .8b
    add             v16.8b, v16.8b, v0.8b
    add             v17.8b, v17.8b, v0.8b
    add             v18.8b, v18.8b, v0.8b
    add             v19.8b, v19.8b, v0.8b
    transpose       v20, v22, v3, .16b, .8h
    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    st1             {v16.8b}, [TMP1]
    transpose_single v20, v21, v3, .2d, .8b
    st1             {v17.8b}, [TMP2]
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    st1             {v18.8b}, [TMP1]
    add             v20.8b, v20.8b, v0.8b
    add             v21.8b, v21.8b, v0.8b
    st1             {v19.8b}, [TMP2]
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    transpose_single v22, v23, v3, .2d, .8b
    st1             {v20.8b}, [TMP1]
    add             v22.8b, v22.8b, v0.8b
    add             v23.8b, v23.8b, v0.8b
    st1             {v21.8b}, [TMP2]
    st1             {v22.8b}, [TMP3]
    st1             {v23.8b}, [TMP4]
    /* Restore x15 and all NEON registers, then return. */
    ldr             x15, [sp], 16
    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v20.8b - v23.8b}, [sp], 32
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v28.8b - v31.8b}, [sp], 32
    /* FIX(review): was "blr x30", which branch-and-links back to the
     * caller and desynchronizes the return-address predictor; use ret. */
    ret

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    transpose       ROW6L, ROW7L, v3, .16b, .4h
    transpose       ROW2L, ROW3L, v3, .16b, .4h
    transpose       ROW0L, ROW1L, v3, .16b, .4h
    transpose       ROW4L, ROW5L, v3, .16b, .4h
    shl             ROW0R.4h, ROW0R.4h, #2  /* PASS1_BITS */
    transpose       ROW1L, ROW3L, v3, .16b, .2s
    transpose       ROW4L, ROW6L, v3, .16b, .2s
    transpose       ROW0L, ROW2L, v3, .16b, .2s
    transpose       ROW5L, ROW7L, v3, .16b, .2s
    cmp             x0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    dup             ROW1R.4h, ROW0R.4h[1]
    dup             ROW2R.4h, ROW0R.4h[2]
    dup             ROW3R.4h, ROW0R.4h[3]
    dup             ROW4R.4h, ROW0R.4h[0]
    dup             ROW5R.4h, ROW0R.4h[1]
    dup             ROW6R.4h, ROW0R.4h[2]
    dup             ROW7R.4h, ROW0R.4h[3]
    dup             ROW0R.4h, ROW0R.4h[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    sshll           v6.4s, ROW0L.4h, #13
    mov             v8.16b, v12.16b
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    add             v2.4s, v2.4s, v12.4s
    add             v12.4s, v12.4s, v12.4s
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    shrn            ROW1L.4h, v2.4s, #16
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    sshll           v10.4s, ROW0L.4h, #13
    shrn            ROW2L.4h, v2.4s, #16
    shrn            ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW3L.4h, v10.4s, #16
    shrn            ROW0L.4h, v12.4s, #16
    shrn            ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW5L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW7L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull           v4.4s, ROW6L.4h, XFIX_0_541196100
    sshll           v6.4s, ROW4L.4h, #13
    mov             v8.16b, v12.16b
    smlal           v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl           v8.4s, ROW5L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    smlal           v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
    add             v2.4s, v2.4s, v12.4s
    add             v12.4s, v12.4s, v12.4s
    smlsl           v10.4s, ROW7L.4h, XFIX_2_562915447
    shrn            ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW6R.4h, v2.4s, #16
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    sshll           v10.4s, ROW4L.4h, #13
    shrn            ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn            ROW5R.4h, v6.4s, #16
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW7R.4h, v4.4s, #16
    shrn            ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW4R.4h, v6.4s, #16
    b               2b  /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R
.endfunc
757
758
759 /*****************************************************************************/
760
761 /*
762 * jsimd_idct_ifast_neon
763 *
764 * This function contains a fast, not so accurate integer implementation of
765 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
766 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
767 * function from jidctfst.c
768 *
769 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
770 * But in ARM NEON case some extra additions are required because VQDMULH
771 * instruction can't handle the constants larger than 1. So the expressions
772 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
773 * which introduces an extra addition. Overall, there are 6 extra additions
774 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
775 */
776
/* Lane aliases for the ifast IDCT constants; order must match the table
 * below.  Per the note above, only fractional parts are stored (the
 * integer part of each multiplier is re-added with plain additions),
 * apparently encoded as (x - 1) * 2^15 — or (x - 2) for 2.613125930 —
 * rounded to the (n*128) grid; for use with sqdmulh. TODO confirm. */
#define XFIX_1_082392200 v0.4h[0]
#define XFIX_1_414213562 v0.4h[1]
#define XFIX_1_847759065 v0.4h[2]
#define XFIX_2_613125930 v0.4h[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
788
/*
 * jsimd_idct_ifast_neon
 *
 * void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
 *                            JSAMPARRAY output_buf, JDIMENSION output_col);
 * Fast (scaled) 8x8 inverse DCT; bit-compatible with jidctfst.c.
 * In:  x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col
 * Clobbers: v0-v19 (saved/restored below), x16, x22, x23, flags.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x22
    TMP5            .req x23

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v8.8h  )
     *   1  | d18    | d19    ( v9.8h  )
     *   2  | d20    | d21    ( v10.8h )
     *   3  | d22    | d23    ( v11.8h )
     *   4  | d24    | d25    ( v12.8h )
     *   5  | d26    | d27    ( v13.8h )
     *   6  | d28    | d29    ( v14.8h )
     *   7  | d30    | d31    ( v15.8h )
     */
    /* Save NEON registers used in fast IDCT.
     * NOTE(review): the post-indexed stores leave sp back at its entry
     * value, so the saved registers live BELOW sp for the whole function.
     * AArch64 Linux has no red zone, so a signal could clobber this area —
     * TODO confirm/restructure the save protocol. */
    sub sp, sp, #176
    stp x22, x23, [sp], 16
    adr x23, jsimd_idct_ifast_neon_consts
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    /* Interleave coefficient loads with quant-table loads and multiplies */
    ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
    mul v8.8h, v8.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v9.8h, v9.8h, v1.8h
    ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
    mul v10.8h, v10.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v11.8h, v11.8h, v3.8h
    ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
    mul v12.8h, v12.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v14.8h, v14.8h, v2.8h
    mul v13.8h, v13.8h, v1.8h
    ld1 {v0.4h}, [x23]      /* load constants (XFIX_* lane aliases) */
    mul v15.8h, v15.8h, v3.8h

    /* 1-D IDCT, pass 1 (columns) */
    sub v2.8h, v10.8h, v14.8h
    add v14.8h, v10.8h, v14.8h
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    /* 8x8 16-bit transpose via trn1/trn2 pairs (v18 = scratch copy) */
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    sub v11.8h, v12.8h, v1.8h
    /* Transpose q14-q15 */
    mov v18.16b, v14.16b
    trn1 v14.8h, v14.8h, v15.8h
    trn2 v15.8h, v18.8h, v15.8h
    add v12.8h, v12.8h, v1.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q12-q13 */
    mov v18.16b, v12.16b
    trn1 v12.8h, v12.8h, v13.8h
    trn2 v13.8h, v18.8h, v13.8h
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* Transpose q12-q14 */
    mov v18.16b, v12.16b
    trn1 v12.4s, v12.4s, v14.4s
    trn2 v14.4s, v18.4s, v14.4s
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q13-q15 */
    mov v18.16b, v13.16b
    trn1 v13.4s, v13.4s, v15.4s
    trn2 v15.4s, v18.4s, v15.4s
    /* 64-bit half swaps emulating ARMv7 vswp (x22 = scratch) */
    /* vswp v14.4h, v10-MSB.4h */
    umov x22, v14.d[0]
    ins v14.2d[0], v10.2d[1]
    ins v10.2d[1], x22
    /* vswp v13.4h, v9MSB.4h */

    umov x22, v13.d[0]
    ins v13.2d[0], v9.2d[1]
    ins v9.2d[1], x22
    /* 1-D IDCT, pass 2 (rows) — interleaved with the remaining swaps */
    sub v2.8h, v10.8h, v14.8h
    /* vswp v15.4h, v11MSB.4h */
    umov x22, v15.d[0]
    ins v15.2d[0], v11.2d[1]
    ins v11.2d[1], x22
    add v14.8h, v10.8h, v14.8h
    /* vswp v12.4h, v8-MSB.4h */
    umov x22, v12.d[0]
    ins v12.2d[0], v8.2d[1]
    ins v8.2d[1], x22
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    sub v11.8h, v12.8h, v1.8h
    add v12.8h, v12.8h, v1.8h
    /* Descale to 8-bit and range limit (saturating >>5, then +128) */
    movi v0.16b, #0x80
    sqshrn v8.8b, v8.8h, #5
    sqshrn2 v8.16b, v9.8h, #5
    sqshrn v9.8b, v10.8h, #5
    sqshrn2 v9.16b, v11.8h, #5
    sqshrn v10.8b, v12.8h, #5
    sqshrn2 v10.16b, v13.8h, #5
    sqshrn v11.8b, v14.8h, #5
    sqshrn2 v11.16b, v15.8h, #5
    add v8.16b, v8.16b, v0.16b
    add v9.16b, v9.16b, v0.16b
    add v10.16b, v10.16b, v0.16b
    add v11.16b, v11.16b, v0.16b
    /* Transpose the final 8-bit samples */
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* make copy */
    ins v17.2d[0], v8.2d[1]
    /* Transpose d16-d17-msb */
    mov v18.16b, v8.16b
    trn1 v8.8b, v8.8b, v17.8b
    trn2 v17.8b, v18.8b, v17.8b
    /* make copy */
    ins v19.2d[0], v9.2d[1]
    mov v18.16b, v9.16b
    trn1 v9.8b, v9.8b, v19.8b
    trn2 v19.8b, v18.8b, v19.8b
    /* Store results to the output buffer, one 8-pixel row per pointer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v8.8b}, [TMP1]
    st1 {v17.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v9.8b}, [TMP1]
    /* make copy */
    ins v7.2d[0], v10.2d[1]
    mov v18.16b, v10.16b
    trn1 v10.8b, v10.8b, v7.8b
    trn2 v7.8b, v18.8b, v7.8b
    st1 {v19.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP4, TMP5, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    add TMP5, TMP5, OUTPUT_COL
    st1 {v10.8b}, [TMP1]
    /* make copy */
    ins v16.2d[0], v11.2d[1]
    mov v18.16b, v11.16b
    trn1 v11.8b, v11.8b, v16.8b
    trn2 v16.8b, v18.8b, v16.8b
    st1 {v7.8b}, [TMP2]
    st1 {v11.8b}, [TMP4]
    st1 {v16.8b}, [TMP5]
    /* Restore saved registers (mirror of the prologue) */
    sub sp, sp, #176
    ldp x22, x23, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ret                     /* was 'blr x30': blr corrupts x30 and the
                             * CPU return-address predictor; ret is the
                             * proper function return */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5             /* was missing: the x23 alias leaked past .endfunc */
    .endfunc
1063
1064
1065 /*****************************************************************************/
1066
1067 /*
1068 * jsimd_idct_4x4_neon
1069 *
1070 * This function contains inverse-DCT code for getting reduced-size
1071 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1072 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1073 * function from jpeg-6b (jidctred.c).
1074 *
1075 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1076 * requires much less arithmetic operations and hence should be faster.
1077 * The primary purpose of this particular NEON optimized function is
1078 * bit exact compatibility with jpeg-6b.
1079 *
1080 * TODO: a bit better instructions scheduling can be achieved by expanding
1081 * idct_helper/transpose_4x4 macros and reordering instructions,
1082 * but readability will suffer somewhat.
1083 */
1084
/* Fixed-point multiplier constants for the reduced-size IDCTs below.
 * FIX(x) = round(x * 2^CONST_BITS); the values must match jidctred.c
 * from jpeg-6b so that output stays bit-exact. */
#define CONST_BITS 13

#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

/* Constant table for jsimd_idct_4x4_neon. Loaded into v0.4h-v2.4h
 * (the function also loads a fourth d-register, v3.4h, which is padding
 * only — see the "v3.4h is just used for padding" note at the load). */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065 /* v0.4h[0] */
    .short -FIX_0_765366865 /* v0.4h[1] */
    .short -FIX_0_211164243 /* v0.4h[2] */
    .short FIX_1_451774981 /* v0.4h[3] */
    .short -FIX_2_172734803 /* d1[0] */
    .short FIX_1_061594337 /* d1[1] */
    .short -FIX_0_509795579 /* d1[2] */
    .short -FIX_0_601344887 /* d1[3] */
    .short FIX_0_899976223 /* v2.4h[0] */
    .short FIX_2_562915447 /* v2.4h[1] */
    .short 1 << (CONST_BITS+1) /* v2.4h[2] */
    .short 0 /* v2.4h[3] */
1116
/* One 4-point 1-D IDCT pass for the 4x4 reduced IDCT.
 * \x4..\x16  : the seven input rows/columns 0,1,2,3,5,6,7 (row 4 unused,
 *              exactly as in jpeg-6b's jpeg_idct_4x4)
 * \shift     : descale amount; \y26..\y29 : the four 16-bit outputs.
 * Uses the constants pre-loaded in v0.4h-v2.4h (see
 * jsimd_idct_4x4_neon_consts); v20, v24, v26, v28, v30 are scratch. */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* Even part: tmp0 = x4 << (CONST_BITS+1) via the 1<<(CONST_BITS+1)
     * multiplier lane, plus the two even rotator terms */
    smull v28.4s, \x4, v2.4h[2]
    smlal v28.4s, \x8, v0.4h[0]
    smlal v28.4s, \x14, v0.4h[1]

    /* Odd part, output columns 0/3 */
    smull v26.4s, \x16, v1.4h[2]
    smlal v26.4s, \x12, v1.4h[3]
    smlal v26.4s, \x10, v2.4h[0]
    smlal v26.4s, \x6, v2.4h[1]

    /* Even part with the rotator terms subtracted */
    smull v30.4s, \x4, v2.4h[2]
    smlsl v30.4s, \x8, v0.4h[0]
    smlsl v30.4s, \x14, v0.4h[1]

    /* Odd part, output columns 1/2 */
    smull v24.4s, \x16, v0.4h[2]
    smlal v24.4s, \x12, v0.4h[3]
    smlal v24.4s, \x10, v1.4h[0]
    smlal v24.4s, \x6, v1.4h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

    /* rshrn encodes shifts of at most 16, so larger descales (pass 2
     * uses 19) go through srshr + xtn instead */
    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v28.4s, v28.4s, #\shift
        xtn \y26, v20.4s
        xtn \y29, v28.4s
    .else
        rshrn \y26, v20.4s, #\shift
        rshrn \y29, v28.4s, #\shift
    .endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v30.4s, v30.4s, #\shift
        xtn \y27, v20.4s
        xtn \y28, v30.4s
    .else
        rshrn \y27, v20.4s, #\shift
        rshrn \y28, v30.4s, #\shift
    .endif

.endm
1163
/*
 * jsimd_idct_4x4_neon
 *
 * void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col);
 * Reduced 8x8 -> 4x4 inverse DCT, bit-compatible with jpeg-6b jidctred.c.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* Save all used NEON registers */
    sub sp, sp, 272
    str x15, [sp], 16
    /* Load constants (v3.4h is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v20.8b - v23.8b}, [sp], 32
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v28.8b - v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 (unused) */
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize: coef row N *= quant row N, then fuse halves to 128 bit */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.2d[1], v5.2d[0]              /* 128 bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.2d[1], v7.2d[0]              /* 128 bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.2d[1], v9.2d[0]              /* 128 bit q8 */
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 4 */
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.2d[1], v11.2d[0]            /* 128 bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.2d[1], v13.2d[0]            /* 128 bit q12 */
    /* FIX: was 'ld1 {v30.8h}', which packed all 16 bytes into v30 and
     * left v31 uninitialized although v31.4h is used just below */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.2d[1], v15.2d[0]            /* 128 bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.2d[1], v17.2d[0]            /* 128 bit q16 */

    /* Pass 1: columns 0-3 then columns 4-7 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.2d[1], v11.2d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.2d[1], v11.2d[0]
    /* Pass 2: rows */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit: +128 (CENTERJSAMPLE), saturate to unsigned 8-bit */
    movi v30.8h, #0x80
    ins v26.2d[1], v27.2d[0]
    ins v28.2d[1], v29.2d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v26.b}[3], [TMP1], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h} ;not available -- restore manually */
    sub sp, sp, #272
    ldr x15, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v20.8b - v23.8b}, [sp], 32
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v28.8b - v31.8b}, [sp], 32
    ret                                 /* was 'blr x30' (clobbers x30) */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .endfunc
1313
1314 .purgem idct_helper
1315
1316
1317 /*****************************************************************************/
1318
1319 /*
1320 * jsimd_idct_2x2_neon
1321 *
1322 * This function contains inverse-DCT code for getting reduced-size
1323 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1324 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1325 * function from jpeg-6b (jidctred.c).
1326 *
1327 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1328 * requires much less arithmetic operations and hence should be faster.
1329 * The primary purpose of this particular NEON optimized function is
1330 * bit exact compatibility with jpeg-6b.
1331 */
1332
/* Constant table for jsimd_idct_2x2_neon; the function loads these four
 * multipliers into v14.4h. */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095 /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785 /* d0[3] */
1339
/* One 2-point 1-D IDCT pass for the 2x2 reduced IDCT.
 * \x4..\x16 : input rows/columns 0,1,3,5,7; \shift : descale amount;
 * \y26/\y27 : the two 16-bit outputs.
 * FIX: the multipliers live in v14.4h — the enclosing function loads the
 * constant table with 'ld1 {v14.4h}' and its inline pass-1 code uses
 * v14.4h lanes; this macro referenced v0.4h, which is never initialized
 * in jsimd_idct_2x2_neon, producing garbage in pass 2. */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.4h[3]
    smlal v26.4s, \x10, v14.4h[2]
    smlal v26.4s, \x12, v14.4h[1]
    smlal v26.4s, \x16, v14.4h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

    /* rshrn encodes shifts of at most 16; pass 2 descales by 20 */
    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v15.4s, v15.4s, #\shift
        xtn \y26, v20.4s
        xtn \y27, v15.4s
    .else
        rshrn \y26, v20.4s, #\shift
        rshrn \y27, v15.4s, #\shift
    .endif

.endm
1361
/*
 * jsimd_idct_2x2_neon
 *
 * void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col);
 * Reduced 8x8 -> 2x2 inverse DCT, bit-compatible with jpeg-6b jidctred.c.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* vpush {v8.4h - v15.4h} ; not available -- save manually */
    sub sp, sp, 208
    str x15, [sp], 16

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v21.8b - v22.8b}, [sp], 16
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v30.8b - v31.8b}, [sp], 16
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 2 */
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 */
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 6 */
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize (high halves are don't-care; the ins below overwrites them) */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.8h, v4.8h, v18.8h
    mul v5.8h, v5.8h, v19.8h            /* FIX: was v18 — row-0 cols 4-7
                                         * were scaled by the cols 0-3
                                         * quant values */
    ins v4.2d[1], v5.2d[0]
    mul v6.8h, v6.8h, v20.8h
    mul v7.8h, v7.8h, v21.8h
    ins v6.2d[1], v7.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 2 */
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.8h, v10.8h, v24.8h
    mul v11.8h, v11.8h, v25.8h
    ins v10.2d[1], v11.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 4 */
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.8h, v12.8h, v26.8h
    mul v13.8h, v13.8h, v27.8h
    ins v12.2d[1], v13.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 6 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.8h, v16.8h, v30.8h
    mul v17.8h, v17.8h, v31.8h
    ins v16.2d[1], v17.2d[0]

    /* Pass 1 (columns), both 4-lane halves processed in parallel */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.4h[3]
    smlal v26.4s, v10.4h, v14.4h[2]
    smlal v26.4s, v12.4h, v14.4h[1]
    smlal v26.4s, v16.4h, v14.4h[0]
    smull v24.4s, v7.4h, v14.4h[3]
    smlal v24.4s, v11.4h, v14.4h[2]
    smlal v24.4s, v13.4h, v14.4h[1]
    smlal v24.4s, v17.4h, v14.4h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
#endif

    /* Pass 2 (rows) */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: +128, saturate to unsigned 8-bit */
    movi v30.8h, #0x80
    ins v26.2d[1], v27.2d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.2d[0], v30.2d[0]
    sqxtun v27.8b, v26.8h

    /* Store the 2x2 result pixels */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    /* Restore saved registers (mirror of the prologue) */
    sub sp, sp, #208
    ldr x15, [sp], 16
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v21.8b - v22.8b}, [sp], 16
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v30.8b - v31.8b}, [sp], 16
    ret                                 /* was 'blr x30' (clobbers x30) */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .endfunc
1497
1498 .purgem idct_helper
1499
1500
1501 /*****************************************************************************/
1502
1503 /*
1504 * jsimd_ycc_extrgb_convert_neon
1505 * jsimd_ycc_extbgr_convert_neon
1506 * jsimd_ycc_extrgbx_convert_neon
1507 * jsimd_ycc_extbgrx_convert_neon
1508 * jsimd_ycc_extxbgr_convert_neon
1509 * jsimd_ycc_extxrgb_convert_neon
1510 *
1511 * Colorspace conversion YCbCr -> RGB
1512 */
1513
1514
/* Load \size Y/Cb/Cr samples into v0/v4/v5 byte lanes.
 * size==8 fills lanes 0-7; the partial sizes stack up so that a trailing
 * 1..7 pixels occupy lanes [0..3], [4..5], [6] respectively before a
 * final do_yuv_to_rgb. */
.macro do_load size
    .if \size == 8
        ld1 {v4.8b}, [U], 8
        ld1 {v5.8b}, [V], 8
        ld1 {v0.8b}, [Y], 8
        prfm PLDL1KEEP, [U, #64]
        prfm PLDL1KEEP, [V, #64]
        prfm PLDL1KEEP, [Y, #64]
    .elseif \size == 4
        /* FIX: every byte load must post-increment its pointer; the
         * original left U (all four lanes) and V (lane 0) un-advanced,
         * re-reading the same byte and desynchronizing the pointers. */
        ld1 {v4.b}[0], [U], 1
        ld1 {v4.b}[1], [U], 1
        ld1 {v4.b}[2], [U], 1
        ld1 {v4.b}[3], [U], 1
        ld1 {v5.b}[0], [V], 1
        ld1 {v5.b}[1], [V], 1
        ld1 {v5.b}[2], [V], 1
        ld1 {v5.b}[3], [V], 1
        ld1 {v0.b}[0], [Y], 1
        ld1 {v0.b}[1], [Y], 1
        ld1 {v0.b}[2], [Y], 1
        ld1 {v0.b}[3], [Y], 1
    .elseif \size == 2
        ld1 {v4.b}[4], [U], 1
        ld1 {v4.b}[5], [U], 1
        ld1 {v5.b}[4], [V], 1
        ld1 {v5.b}[5], [V], 1
        ld1 {v0.b}[4], [Y], 1
        ld1 {v0.b}[5], [Y], 1
    .elseif \size == 1
        ld1 {v4.b}[6], [U], 1
        ld1 {v5.b}[6], [V], 1
        ld1 {v0.b}[6], [Y], 1
    .else
        .error unsupported macroblock size
    .endif
.endm
1551
/* Store \size interleaved RGB(A) pixels from v10-v13 byte lanes.
 * Lane indices mirror the do_load fill order: [0..3], [4..5], [6]. */
.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
        .elseif \size == 4
            st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
        .elseif \size == 2
            st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[5], [RGB], 3  /* FIX: was lane [4]
                                                     * stored twice; second
                                                     * pixel is lane [5],
                                                     * as in the 32 bpp
                                                     * branch below */
        .elseif \size == 1
            st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
        .elseif \size == 4
            st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
        .elseif \size == 2
            st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
        .elseif \size == 1
            st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
1589
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen U/V (bias by -128 via v2) and form the four chroma
 * products in 32-bit accumulators v20-v31. */
.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
    smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
.endm

/* Stage 2: descale, add Y, and narrow into the output byte registers.
 * "v1\g_offs\defsize" pastes into e.g. v11.8b, routing each channel to
 * its position in the v10-v13 store group. */
.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
    sqxtun v1\g_offs\defsize, v20.8h
    sqxtun v1\r_offs\defsize, v24.8h
    sqxtun v1\b_offs\defsize, v28.8h

.endm

/* Software-pipelined steady-state iteration: finish stage 2 of pixel
 * group N, store it, and start stage 1 of group N+1, with the loads
 * interleaved to hide latency. */
.macro do_yuv_to_rgb_stage2_store_load_stage1
    ld1 {v4.8b}, [U], 8
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v5.8b}, [V], 8
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
    sqxtun v1\g_offs\defsize, v20.8h
    ld1 {v0.8b}, [Y], 8
    sqxtun v1\r_offs\defsize, v24.8h
    prfm PLDL1KEEP, [U, #64]
    prfm PLDL1KEEP, [V, #64]
    prfm PLDL1KEEP, [Y, #64]
    sqxtun v1\b_offs\defsize, v28.8h
    uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
    do_store \bpp, 8
    smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req x0
    INPUT_BUF       .req x1
    INPUT_ROW       .req x2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req x4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req x7
    Y               .req x8
    U               .req x9
    V               .req x10
    N               .req x15

    sub sp, sp, 336
    str x15, [sp], 16
    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    adr x15, jsimd_ycc_\colorid\()_neon_consts
    /* Save NEON registers */
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v20.8b - v23.8b}, [sp], 32
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v28.8b - v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    /* Save ARM registers and handle input arguments */
    /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
    stp x4, x5, [sp], 16
    stp x6, x7, [sp], 16
    stp x8, x9, [sp], 16
    stp x10, x30, [sp], 16
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, 8]
    ldr INPUT_BUF2, [INPUT_BUF, 16]
    .unreq INPUT_BUF

    /* Preset the alpha-channel bytes to 0xFF. Stage 2 only ever writes
     * the R/G/B members of the v10-v13 store group, so the remaining
     * register is the alpha: v10 for X-leading layouts (extxbgr/extxrgb)
     * and v13 for X-trailing layouts (extrgbx/extbgrx).
     * FIX: was v10 and v12 — v12 always holds a colour channel and is
     * overwritten, while v13 (alpha for ext*x) stayed uninitialized. */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    lsl x16, INPUT_ROW, #3
    ldr Y, [INPUT_BUF0, x16]
    ldr U, [INPUT_BUF1, x16]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, x16]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    blt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    sub sp, sp, #336
    ldr x15, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v20.8b - v23.8b}, [sp], 32
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v28.8b - v31.8b}, [sp], 32
    /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
    ldp x4, x5, [sp], 16
    ldp x6, x7, [sp], 16
    ldp x8, x9, [sp], 16
    ldp x10, x30, [sp], 16
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
    .endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
1814
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b

.purgem do_load
.purgem do_store
OLDNEW
« libjpeg.gyp ('K') | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698