OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * ARMv8 NEON optimizations for libjpeg-turbo |
| 3 * |
| 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 5 * All rights reserved. |
| 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 7 * Copyright (C) 2013-2014, Linaro Limited |
| 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
| 9 * |
| 10 * This software is provided 'as-is', without any express or implied |
| 11 * warranty. In no event will the authors be held liable for any damages |
| 12 * arising from the use of this software. |
| 13 * |
| 14 * Permission is granted to anyone to use this software for any purpose, |
| 15 * including commercial applications, and to alter it and redistribute it |
| 16 * freely, subject to the following restrictions: |
| 17 * |
| 18 * 1. The origin of this software must not be misrepresented; you must not |
| 19 * claim that you wrote the original software. If you use this software |
| 20 * in a product, an acknowledgment in the product documentation would be |
| 21 * appreciated but is not required. |
| 22 * 2. Altered source versions must be plainly marked as such, and must not be |
| 23 * misrepresented as being the original software. |
| 24 * 3. This notice may not be removed or altered from any source distribution. |
| 25 */ |
| 26 |
| 27 #if defined(__linux__) && defined(__ELF__) |
| 28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
| 29 #endif |
| 30 |
| 31 .text |
| 32 .arch armv8-a+fp+simd |
| 33 |
| 34 |
| 35 #define RESPECT_STRICT_ALIGNMENT 1 |
| 36 |
| 37 |
| 38 /*****************************************************************************/ |
| 39 |
| 40 /* Supplementary macro for setting function attributes */ |
| 41 .macro asm_function fname |
| 42 #ifdef __APPLE__ |
| 43 .globl _\fname |
| 44 _\fname: |
| 45 #else |
| 46 .global \fname |
| 47 #ifdef __ELF__ |
| 48 .hidden \fname |
| 49 .type \fname, %function |
| 50 #endif |
| 51 \fname: |
| 52 #endif |
| 53 .endm |
| 54 |
| 55 /* Transpose elements of a single 128-bit register */ |
| 56 .macro transpose_single x0,x1,xi,xilen,literal |
| 57 ins \xi\xilen[0], \x0\xilen[0] |
| 58 ins \x1\xilen[0], \x0\xilen[1] |
| 59 trn1 \x0\literal, \x0\literal, \x1\literal |
| 60 trn2 \x1\literal, \xi\literal, \x1\literal |
| 61 .endm |
| 62 |
| 63 /* Transpose elements of 2 different registers */ |
| 64 .macro transpose x0,x1,xi,xilen,literal |
| 65 mov \xi\xilen, \x0\xilen |
| 66 trn1 \x0\literal, \x0\literal, \x1\literal |
| 67 trn2 \x1\literal, \xi\literal, \x1\literal |
| 68 .endm |
| 69 |
| 70 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
| 71 .macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen |
| 72 mov \xi\xilen, \x0\xilen |
| 73 trn1 \x0\x0len, \x0\x0len, \x2\x2len |
| 74 trn2 \x2\x2len, \xi\x0len, \x2\x2len |
| 75 mov \xi\xilen, \x1\xilen |
| 76 trn1 \x1\x1len, \x1\x1len, \x3\x3len |
| 77 trn2 \x3\x3len, \xi\x1len, \x3\x3len |
| 78 .endm |
| 79 |
| 80 .macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen |
| 81 mov \xi\xilen, \x0\xilen |
| 82 trn1 \x0\x0len, \x0\x0len, \x1\x1len |
| 83 trn2 \x1\x2len, \xi\x0len, \x1\x2len |
| 84 mov \xi\xilen, \x2\xilen |
| 85 trn1 \x2\x2len, \x2\x2len, \x3\x3len |
| 86 trn2 \x3\x2len, \xi\x1len, \x3\x3len |
| 87 .endm |
| 88 |
| 89 .macro transpose_4x4 x0, x1, x2, x3,x5 |
| 90 transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b |
| 91 transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b |
| 92 .endm |
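The three macros above compose a full 4x4 transpose of 16-bit elements: transpose_4x4_16 interleaves the row pairs (0,1) and (2,3) with TRN1/TRN2 on .4h lanes, and transpose_4x4_32 then exchanges the 32-bit halves between rows (0,2) and (1,3). A minimal C model of the same two-step shuffle (illustration only, not part of the assembly source; transpose_4x4_model is a made-up name):

    #include <stdint.h>

    static void transpose_4x4_model(int16_t m[4][4])
    {
      int16_t a[4][4], out[4][4];

      /* step 1: trn1/trn2 on 16-bit lanes within the row pairs (0,1) and (2,3) */
      for (int p = 0; p < 4; p += 2)
        for (int c = 0; c < 4; c += 2) {
          a[p][c]     = m[p][c];      a[p][c + 1]     = m[p + 1][c];
          a[p + 1][c] = m[p][c + 1];  a[p + 1][c + 1] = m[p + 1][c + 1];
        }

      /* step 2: trn1/trn2 on 32-bit lanes between the row pairs (0,2) and (1,3) */
      for (int p = 0; p < 2; p++) {
        int q = p + 2;
        out[p][0] = a[p][0];  out[p][1] = a[p][1];
        out[p][2] = a[q][0];  out[p][3] = a[q][1];
        out[q][0] = a[p][2];  out[q][1] = a[p][3];
        out[q][2] = a[q][2];  out[q][3] = a[q][3];
      }

      for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
          m[r][c] = out[r][c];  /* m now holds the transposed block */
    }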
| 93 |
| 94 |
| 95 #define CENTERJSAMPLE 128 |
| 96 |
| 97 /*****************************************************************************/ |
| 98 |
| 99 /* |
| 100 * Perform dequantization and inverse DCT on one block of coefficients. |
| 101 * |
| 102 * GLOBAL(void) |
| 103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, |
| 104 * JSAMPARRAY output_buf, JDIMENSION output_col) |
| 105 */ |
| 106 |
| 107 #define FIX_0_298631336 (2446) |
| 108 #define FIX_0_390180644 (3196) |
| 109 #define FIX_0_541196100 (4433) |
| 110 #define FIX_0_765366865 (6270) |
| 111 #define FIX_0_899976223 (7373) |
| 112 #define FIX_1_175875602 (9633) |
| 113 #define FIX_1_501321110 (12299) |
| 114 #define FIX_1_847759065 (15137) |
| 115 #define FIX_1_961570560 (16069) |
| 116 #define FIX_2_053119869 (16819) |
| 117 #define FIX_2_562915447 (20995) |
| 118 #define FIX_3_072711026 (25172) |
| 119 |
| 120 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
| 121 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
| 122 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
| 123 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
| 124 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
| 125 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
| 126 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
| 127 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
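The FIX_* constants above follow libjpeg's usual CONST_BITS = 13 fixed-point convention from jidctint.c, i.e. FIX(x) = round(x * 2^13); CONST_BITS is defined explicitly further down in this file for the reduced-size IDCT routines. A quick check of a few of the values (illustration only, not part of the assembly source):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      /* FIX(x) = round(x * 2^13), CONST_BITS = 13 */
      printf("%ld\n", lround(0.298631336 * (1 << 13)));   /* 2446  == FIX_0_298631336 */
      printf("%ld\n", lround(0.541196100 * (1 << 13)));   /* 4433  == FIX_0_541196100 */
      printf("%ld\n", lround(1.175875602 * (1 << 13)));   /* 9633  == FIX_1_175875602 */
      printf("%ld\n", lround(3.072711026 * (1 << 13)));   /* 25172 == FIX_3_072711026 */
      return 0;
    }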
| 128 |
| 129 /* |
| 130 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
| 131 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
| 132 */ |
| 133 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
| 134 { \ |
| 135 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
| 136 INT32 q1, q2, q3, q4, q5, q6, q7; \ |
| 137 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
| 138 \ |
| 139 /* 1-D iDCT input data */ \ |
| 140 row0 = xrow0; \ |
| 141 row1 = xrow1; \ |
| 142 row2 = xrow2; \ |
| 143 row3 = xrow3; \ |
| 144 row4 = xrow4; \ |
| 145 row5 = xrow5; \ |
| 146 row6 = xrow6; \ |
| 147 row7 = xrow7; \ |
| 148 \ |
| 149 q5 = row7 + row3; \ |
| 150 q4 = row5 + row1; \ |
| 151 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
| 152 MULTIPLY(q4, FIX_1_175875602); \ |
| 153 q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
| 154 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
| 155 q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
| 156 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
| 157 q4 = q6; \ |
| 158 q3 = ((INT32) row0 - (INT32) row4) << 13; \ |
| 159 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
| 160 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
| 161 /* now we can use q1 (reloadable constants have been used up) */ \ |
| 162 q1 = q3 + q2; \ |
| 163 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
| 164 MULTIPLY(row1, -FIX_0_899976223); \ |
| 165 q5 = q7; \ |
| 166 q1 = q1 + q6; \ |
| 167 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
| 168 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
| 169 \ |
| 170 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
| 171 tmp11_plus_tmp2 = q1; \ |
| 172 row1 = 0; \ |
| 173 \ |
| 174 q1 = q1 - q6; \ |
| 175 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
| 176 MULTIPLY(row3, -FIX_2_562915447); \ |
| 177 q1 = q1 - q6; \ |
| 178 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
| 179 MULTIPLY(row6, FIX_0_541196100); \ |
| 180 q3 = q3 - q2; \ |
| 181 \ |
| 182 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
| 183 tmp11_minus_tmp2 = q1; \ |
| 184 \ |
| 185 q1 = ((INT32) row0 + (INT32) row4) << 13; \ |
| 186 q2 = q1 + q6; \ |
| 187 q1 = q1 - q6; \ |
| 188 \ |
| 189 /* pick up the results */ \ |
| 190 tmp0 = q4; \ |
| 191 tmp1 = q5; \ |
| 192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
| 193 tmp3 = q7; \ |
| 194 tmp10 = q2; \ |
| 195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
| 196 tmp12 = q3; \ |
| 197 tmp13 = q1; \ |
| 198 } |
| 199 |
| 200 #define XFIX_0_899976223 v0.4h[0] |
| 201 #define XFIX_0_541196100 v0.4h[1] |
| 202 #define XFIX_2_562915447 v0.4h[2] |
| 203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] |
| 204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] |
| 205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] |
| 206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] |
| 207 #define XFIX_1_175875602 v1.4h[3] |
| 208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] |
| 209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] |
| 210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] |
| 211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] |
| 212 |
| 213 .balign 16 |
| 214 jsimd_idct_islow_neon_consts: |
| 215 .short FIX_0_899976223 /* d0[0] */ |
| 216 .short FIX_0_541196100 /* d0[1] */ |
| 217 .short FIX_2_562915447 /* d0[2] */ |
| 218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
| 219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
| 220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
| 221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
| 222 .short FIX_1_175875602 /* d1[3] */ |
| 223 /* reloadable constants */ |
| 224 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
| 225 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
| 226 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
| 227 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
| 228 |
| 229 asm_function jsimd_idct_islow_neon |
| 230 |
| 231 DCT_TABLE .req x0 |
| 232 COEF_BLOCK .req x1 |
| 233 OUTPUT_BUF .req x2 |
| 234 OUTPUT_COL .req x3 |
| 235 TMP1 .req x0 |
| 236 TMP2 .req x1 |
| 237 TMP3 .req x2 |
| 238 TMP4 .req x15 |
| 239 |
| 240 ROW0L .req v16 |
| 241 ROW0R .req v17 |
| 242 ROW1L .req v18 |
| 243 ROW1R .req v19 |
| 244 ROW2L .req v20 |
| 245 ROW2R .req v21 |
| 246 ROW3L .req v22 |
| 247 ROW3R .req v23 |
| 248 ROW4L .req v24 |
| 249 ROW4R .req v25 |
| 250 ROW5L .req v26 |
| 251 ROW5R .req v27 |
| 252 ROW6L .req v28 |
| 253 ROW6R .req v29 |
| 254 ROW7L .req v30 |
| 255 ROW7R .req v31 |
| 256 /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ |
| 257 sub sp, sp, 272 |
| 258 str x15, [sp], 16 |
| 259 adr x15, jsimd_idct_islow_neon_consts |
| 260 st1 {v0.8b - v3.8b}, [sp], 32 |
| 261 st1 {v4.8b - v7.8b}, [sp], 32 |
| 262 st1 {v8.8b - v11.8b}, [sp], 32 |
| 263 st1 {v12.8b - v15.8b}, [sp], 32 |
| 264 st1 {v16.8b - v19.8b}, [sp], 32 |
| 265 st1 {v20.8b - v23.8b}, [sp], 32 |
| 266 st1 {v24.8b - v27.8b}, [sp], 32 |
| 267 st1 {v28.8b - v31.8b}, [sp], 32 |
| 268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 |
| 269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 |
| 271 mul v16.4h, v16.4h, v0.4h |
| 272 mul v17.4h, v17.4h, v1.4h |
| 273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ |
| 274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 275 mul v18.4h, v18.4h, v2.4h |
| 276 mul v19.4h, v19.4h, v3.4h |
| 277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ |
| 278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 |
| 279 mul v20.4h, v20.4h, v4.4h |
| 280 mul v21.4h, v21.4h, v5.4h |
| 281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ |
| 282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 283 mul v22.4h, v22.4h, v6.4h |
| 284 mul v23.4h, v23.4h, v7.4h |
| 285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ |
| 286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] |
| 287 mul v24.4h, v24.4h, v0.4h |
| 288 mul v25.4h, v25.4h, v1.4h |
| 289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ |
| 290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 291 mul v28.4h, v28.4h, v4.4h |
| 292 mul v29.4h, v29.4h, v5.4h |
| 293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ |
| 294 mul v26.4h, v26.4h, v2.4h |
| 295 mul v27.4h, v27.4h, v3.4h |
| 296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ |
| 297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ |
| 298 add x15, x15, #16 |
| 299 mul v30.4h, v30.4h, v6.4h |
| 300 mul v31.4h, v31.4h, v7.4h |
| 301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ |
| 302 /* Go to the bottom of the stack */ |
| 303 sub sp, sp, 352 |
| 304 stp x4, x5, [sp], 16 |
| 305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ |
| 306 st1 {v12.4h - v15.4h}, [sp], 32 |
| 307 /* 1-D IDCT, pass 1, left 4x8 half */ |
| 308 add v4.4h, ROW7L.4h, ROW3L.4h |
| 309 add v5.4h, ROW5L.4h, ROW1L.4h |
| 310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 311 smlal v12.4s, v5.4h, XFIX_1_175875602 |
| 312 smull v14.4s, v4.4h, XFIX_1_175875602 |
| 313 /* Check for the zero coefficients in the right 4x8 half */ |
| 314 smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 315 ssubl v6.4s, ROW0L.4h, ROW4L.4h |
| 316 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
| 317 smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 318 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 319 orr x0, x4, x5 |
| 320 mov v8.16b, v12.16b |
| 321 smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 |
| 322 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
| 323 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 324 shl v6.4s, v6.4s, #13 |
| 325 orr x0, x0, x4 |
| 326 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 327 orr x0, x0, x5 |
| 328 add v2.4s, v6.4s, v4.4s |
| 329 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
| 330 mov v10.16b, v14.16b |
| 331 add v2.4s, v2.4s, v12.4s |
| 332 orr x0, x0, x4 |
| 333 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 |
| 334 orr x0, x0, x5 |
| 335 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 336 rshrn ROW1L.4h, v2.4s, #11 |
| 337 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
| 338 sub v2.4s, v2.4s, v12.4s |
| 339 smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 340 orr x0, x0, x4 |
| 341 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 342 orr x0, x0, x5 |
| 343 sub v2.4s, v2.4s, v12.4s |
| 344 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 345 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
| 346 smlal v12.4s, ROW6L.4h, XFIX_0_541196100 |
| 347 sub v6.4s, v6.4s, v4.4s |
| 348 orr x0, x0, x4 |
| 349 rshrn ROW6L.4h, v2.4s, #11 |
| 350 orr x0, x0, x5 |
| 351 add v2.4s, v6.4s, v10.4s |
| 352 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
| 353 sub v6.4s, v6.4s, v10.4s |
| 354 saddl v10.4s, ROW0L.4h, ROW4L.4h |
| 355 orr x0, x0, x4 |
| 356 rshrn ROW2L.4h, v2.4s, #11 |
| 357 orr x0, x0, x5 |
| 358 rshrn ROW5L.4h, v6.4s, #11 |
| 359 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
| 360 shl v10.4s, v10.4s, #13 |
| 361 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 362 orr x0, x0, x4 |
| 363 add v4.4s, v10.4s, v12.4s |
| 364 orr x0, x0, x5 |
| 365 cmp x0, #0 /* orrs instruction removed */ |
| 366 sub v2.4s, v10.4s, v12.4s |
| 367 add v12.4s, v4.4s, v14.4s |
| 368 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
| 369 sub v4.4s, v4.4s, v14.4s |
| 370 add v10.4s, v2.4s, v8.4s |
| 371 orr x0, x4, x5 |
| 372 sub v6.4s, v2.4s, v8.4s |
| 373 /* pop {x4, x5} */ |
| 374 sub sp, sp, 80 |
| 375 ldp x4, x5, [sp], 16 |
| 376 rshrn ROW7L.4h, v4.4s, #11 |
| 377 rshrn ROW3L.4h, v10.4s, #11 |
| 378 rshrn ROW0L.4h, v12.4s, #11 |
| 379 rshrn ROW4L.4h, v6.4s, #11 |
| 380 |
| 381 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ |
| 382 |
| 383 /* 1-D IDCT, pass 1, right 4x8 half */ |
| 384 ld1 {v2.4h}, [x15] /* reload constants */ |
| 385 add v10.4h, ROW7R.4h, ROW3R.4h |
| 386 add v8.4h, ROW5R.4h, ROW1R.4h |
| 387 /* Transpose ROW6L <-> ROW7L (v3 available free register) */ |
| 388 transpose ROW6L, ROW7L, v3, .16b, .4h |
| 389 smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 390 smlal v12.4s, v8.4h, XFIX_1_175875602 |
| 391 /* Transpose ROW2L <-> ROW3L (v3 available free register) */ |
| 392 transpose ROW2L, ROW3L, v3, .16b, .4h |
| 393 smull v14.4s, v10.4h, XFIX_1_175875602 |
| 394 smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 395 /* Transpose ROW0L <-> ROW1L (v3 available free register) */ |
| 396 transpose ROW0L, ROW1L, v3, .16b, .4h |
| 397 ssubl v6.4s, ROW0R.4h, ROW4R.4h |
| 398 smull v4.4s, ROW2R.4h, XFIX_0_541196100 |
| 399 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 400 /* Transpose ROW4L <-> ROW5L (v3 available free register) */ |
| 401 transpose ROW4L, ROW5L, v3, .16b, .4h |
| 402 mov v8.16b, v12.16b |
| 403 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 |
| 404 smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 405 /* Transpose ROW1L <-> ROW3L (v3 available free register) */ |
| 406 transpose ROW1L, ROW3L, v3, .16b, .2s |
| 407 shl v6.4s, v6.4s, #13 |
| 408 smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 |
| 409 /* Transpose ROW4L <-> ROW6L (v3 available free register) */ |
| 410 transpose ROW4L, ROW6L, v3, .16b, .2s |
| 411 add v2.4s, v6.4s, v4.4s |
| 412 mov v10.16b, v14.16b |
| 413 add v2.4s, v2.4s, v12.4s |
| 414 /* Transpose ROW0L <-> ROW2L (v3 available free register) */ |
| 415 transpose ROW0L, ROW2L, v3, .16b, .2s |
| 416 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 |
| 417 smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 418 rshrn ROW1R.4h, v2.4s, #11 |
| 419 /* Transpose ROW5L <-> ROW7L (v3 available free register) */ |
| 420 transpose ROW5L, ROW7L, v3, .16b, .2s |
| 421 sub v2.4s, v2.4s, v12.4s |
| 422 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 423 smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 |
| 424 sub v2.4s, v2.4s, v12.4s |
| 425 smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 426 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 |
| 427 sub v6.4s, v6.4s, v4.4s |
| 428 rshrn ROW6R.4h, v2.4s, #11 |
| 429 add v2.4s, v6.4s, v10.4s |
| 430 sub v6.4s, v6.4s, v10.4s |
| 431 saddl v10.4s, ROW0R.4h, ROW4R.4h |
| 432 rshrn ROW2R.4h, v2.4s, #11 |
| 433 rshrn ROW5R.4h, v6.4s, #11 |
| 434 shl v10.4s, v10.4s, #13 |
| 435 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 436 add v4.4s, v10.4s, v12.4s |
| 437 sub v2.4s, v10.4s, v12.4s |
| 438 add v12.4s, v4.4s, v14.4s |
| 439 sub v4.4s, v4.4s, v14.4s |
| 440 add v10.4s, v2.4s, v8.4s |
| 441 sub v6.4s, v2.4s, v8.4s |
| 442 rshrn ROW7R.4h, v4.4s, #11 |
| 443 rshrn ROW3R.4h, v10.4s, #11 |
| 444 rshrn ROW0R.4h, v12.4s, #11 |
| 445 rshrn ROW4R.4h, v6.4s, #11 |
| 446 /* Transpose right 4x8 half */ |
| 447 transpose ROW6R, ROW7R, v3, .16b, .4h |
| 448 transpose ROW2R, ROW3R, v3, .16b, .4h |
| 449 transpose ROW0R, ROW1R, v3, .16b, .4h |
| 450 transpose ROW4R, ROW5R, v3, .16b, .4h |
| 451 transpose ROW1R, ROW3R, v3, .16b, .2s |
| 452 transpose ROW4R, ROW6R, v3, .16b, .2s |
| 453 transpose ROW0R, ROW2R, v3, .16b, .2s |
| 454 transpose ROW5R, ROW7R, v3, .16b, .2s |
| 455 |
| 456 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
| 457 ld1 {v2.4h}, [x15] /* reload constants */ |
| 458 smull v12.4s, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
| 459 smlal v12.4s, ROW1L.4h, XFIX_1_175875602 |
| 460 smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
| 461 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 462 smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
| 463 smlal v14.4s, ROW3L.4h, XFIX_1_175875602 |
| 464 smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
| 465 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 466 ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 467 smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 468 smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ |
| 469 mov v8.16b, v12.16b |
| 470 smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
| 471 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 472 shl v6.4s, v6.4s, #13 |
| 473 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 474 add v2.4s, v6.4s, v4.4s |
| 475 mov v10.16b, v14.16b |
| 476 add v2.4s, v2.4s, v12.4s |
| 477 smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
| 478 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 479 shrn ROW1L.4h, v2.4s, #16 |
| 480 sub v2.4s, v2.4s, v12.4s |
| 481 smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
| 482 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 483 sub v2.4s, v2.4s, v12.4s |
| 484 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 485 smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
| 486 sub v6.4s, v6.4s, v4.4s |
| 487 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 488 add v2.4s, v6.4s, v10.4s |
| 489 sub v6.4s, v6.4s, v10.4s |
| 490 saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 491 shrn ROW2L.4h, v2.4s, #16 |
| 492 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 493 shl v10.4s, v10.4s, #13 |
| 494 smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
| 495 add v4.4s, v10.4s, v12.4s |
| 496 sub v2.4s, v10.4s, v12.4s |
| 497 add v12.4s, v4.4s, v14.4s |
| 498 sub v4.4s, v4.4s, v14.4s |
| 499 add v10.4s, v2.4s, v8.4s |
| 500 sub v6.4s, v2.4s, v8.4s |
| 501 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 502 shrn ROW3L.4h, v10.4s, #16 |
| 503 shrn ROW0L.4h, v12.4s, #16 |
| 504 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 505 /* 1-D IDCT, pass 2, right 4x8 half */ |
| 506 ld1 {v2.4h}, [x15] /* reload constants */ |
| 507 smull v12.4s, ROW5R.4h, XFIX_1_175875602 |
| 508 smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
| 509 smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 510 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
| 511 smull v14.4s, ROW7R.4h, XFIX_1_175875602 |
| 512 smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
| 513 smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 514 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
| 515 ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 516 smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
| 517 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 518 mov v8.16b, v12.16b |
| 519 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 |
| 520 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
| 521 shl v6.4s, v6.4s, #13 |
| 522 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
| 523 add v2.4s, v6.4s, v4.4s |
| 524 mov v10.16b, v14.16b |
| 525 add v2.4s, v2.4s, v12.4s |
| 526 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 |
| 527 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
| 528 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 529 sub v2.4s, v2.4s, v12.4s |
| 530 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 531 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
| 532 sub v2.4s, v2.4s, v12.4s |
| 533 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ |
| 534 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 |
| 535 sub v6.4s, v6.4s, v4.4s |
| 536 shrn ROW6R.4h, v2.4s, #16 |
| 537 add v2.4s, v6.4s, v10.4s |
| 538 sub v6.4s, v6.4s, v10.4s |
| 539 saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 540 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 541 shrn ROW5R.4h, v6.4s, #16 |
| 542 shl v10.4s, v10.4s, #13 |
| 543 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 544 add v4.4s, v10.4s, v12.4s |
| 545 sub v2.4s, v10.4s, v12.4s |
| 546 add v12.4s, v4.4s, v14.4s |
| 547 sub v4.4s, v4.4s, v14.4s |
| 548 add v10.4s, v2.4s, v8.4s |
| 549 sub v6.4s, v2.4s, v8.4s |
| 550 shrn ROW7R.4h, v4.4s, #16 |
| 551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 553 shrn ROW4R.4h, v6.4s, #16 |
| 554 |
| 555 2: /* Descale to 8-bit and range limit */ |
| 556 ins v16.2d[1], v17.2d[0] |
| 557 ins v18.2d[1], v19.2d[0] |
| 558 ins v20.2d[1], v21.2d[0] |
| 559 ins v22.2d[1], v23.2d[0] |
| 560 sqrshrn v16.8b, v16.8h, #2 |
| 561 sqrshrn2 v16.16b, v18.8h, #2 |
| 562 sqrshrn v18.8b, v20.8h, #2 |
| 563 sqrshrn2 v18.16b, v22.8h, #2 |
| 564 |
| 565 /* vpop {v8.4h - v15.4h} */ /* restore NEON registers */ |
| 566 ld1 {v8.4h - v11.4h}, [sp], 32 |
| 567 ld1 {v12.4h - v15.4h}, [sp], 32 |
| 568 ins v24.2d[1], v25.2d[0] |
| 569 |
| 570 sqrshrn v20.8b, v24.8h, #2 |
| 571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
| 572 /* trn1 v16.8h, v16.8h, v18.8h */ |
| 573 transpose v16, v18, v3, .16b, .8h |
| 574 ins v26.2d[1], v27.2d[0] |
| 575 ins v28.2d[1], v29.2d[0] |
| 576 ins v30.2d[1], v31.2d[0] |
| 577 sqrshrn2 v20.16b, v26.8h, #2 |
| 578 sqrshrn v22.8b, v28.8h, #2 |
| 579 movi v0.16b, #(CENTERJSAMPLE) |
| 580 sqrshrn2 v22.16b, v30.8h, #2 |
| 581 transpose_single v16, v17, v3, .2d, .8b |
| 582 transpose_single v18, v19, v3, .2d, .8b |
| 583 add v16.8b, v16.8b, v0.8b |
| 584 add v17.8b, v17.8b, v0.8b |
| 585 add v18.8b, v18.8b, v0.8b |
| 586 add v19.8b, v19.8b, v0.8b |
| 587 transpose v20, v22, v3, .16b, .8h |
| 588 /* Store results to the output buffer */ |
| 589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 590 add TMP1, TMP1, OUTPUT_COL |
| 591 add TMP2, TMP2, OUTPUT_COL |
| 592 st1 {v16.8b}, [TMP1] |
| 593 transpose_single v20, v21, v3, .2d, .8b |
| 594 st1 {v17.8b}, [TMP2] |
| 595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 596 add TMP1, TMP1, OUTPUT_COL |
| 597 add TMP2, TMP2, OUTPUT_COL |
| 598 st1 {v18.8b}, [TMP1] |
| 599 add v20.8b, v20.8b, v0.8b |
| 600 add v21.8b, v21.8b, v0.8b |
| 601 st1 {v19.8b}, [TMP2] |
| 602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 603 ldp TMP3, TMP4, [OUTPUT_BUF] |
| 604 add TMP1, TMP1, OUTPUT_COL |
| 605 add TMP2, TMP2, OUTPUT_COL |
| 606 add TMP3, TMP3, OUTPUT_COL |
| 607 add TMP4, TMP4, OUTPUT_COL |
| 608 transpose_single v22, v23, v3, .2d, .8b |
| 609 st1 {v20.8b}, [TMP1] |
| 610 add v22.8b, v22.8b, v0.8b |
| 611 add v23.8b, v23.8b, v0.8b |
| 612 st1 {v21.8b}, [TMP2] |
| 613 st1 {v22.8b}, [TMP3] |
| 614 st1 {v23.8b}, [TMP4] |
| 615 ldr x15, [sp], 16 |
| 616 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 617 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 618 ld1 {v8.8b - v11.8b}, [sp], 32 |
| 619 ld1 {v12.8b - v15.8b}, [sp], 32 |
| 620 ld1 {v16.8b - v19.8b}, [sp], 32 |
| 621 ld1 {v20.8b - v23.8b}, [sp], 32 |
| 622 ld1 {v24.8b - v27.8b}, [sp], 32 |
| 623 ld1 {v28.8b - v31.8b}, [sp], 32 |
| 624 blr x30 |
| 625 |
| 626 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
| 627 |
| 628 /* Transpose left 4x8 half */ |
| 629 transpose ROW6L, ROW7L, v3, .16b, .4h |
| 630 transpose ROW2L, ROW3L, v3, .16b, .4h |
| 631 transpose ROW0L, ROW1L, v3, .16b, .4h |
| 632 transpose ROW4L, ROW5L, v3, .16b, .4h |
| 633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ |
| 634 transpose ROW1L, ROW3L, v3, .16b, .2s |
| 635 transpose ROW4L, ROW6L, v3, .16b, .2s |
| 636 transpose ROW0L, ROW2L, v3, .16b, .2s |
| 637 transpose ROW5L, ROW7L, v3, .16b, .2s |
| 638 cmp x0, #0 |
| 639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
| 640 |
| 641 /* Only row 0 is non-zero for the right 4x8 half */ |
| 642 dup ROW1R.4h, ROW0R.4h[1] |
| 643 dup ROW2R.4h, ROW0R.4h[2] |
| 644 dup ROW3R.4h, ROW0R.4h[3] |
| 645 dup ROW4R.4h, ROW0R.4h[0] |
| 646 dup ROW5R.4h, ROW0R.4h[1] |
| 647 dup ROW6R.4h, ROW0R.4h[2] |
| 648 dup ROW7R.4h, ROW0R.4h[3] |
| 649 dup ROW0R.4h, ROW0R.4h[0] |
| 650 b 1b /* Go to 'normal' second pass */ |
| 651 |
| 652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
| 653 ld1 {v2.4h}, [x15] /* reload constants */ |
| 654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 |
| 655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 |
| 657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 659 sshll v6.4s, ROW0L.4h, #13 |
| 660 mov v8.16b, v12.16b |
| 661 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 662 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 663 add v2.4s, v6.4s, v4.4s |
| 664 mov v10.16b, v14.16b |
| 665 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 666 add v2.4s, v2.4s, v12.4s |
| 667 add v12.4s, v12.4s, v12.4s |
| 668 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 669 shrn ROW1L.4h, v2.4s, #16 |
| 670 sub v2.4s, v2.4s, v12.4s |
| 671 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 672 sub v6.4s, v6.4s, v4.4s |
| 673 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 674 add v2.4s, v6.4s, v10.4s |
| 675 sub v6.4s, v6.4s, v10.4s |
| 676 sshll v10.4s, ROW0L.4h, #13 |
| 677 shrn ROW2L.4h, v2.4s, #16 |
| 678 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 679 add v4.4s, v10.4s, v12.4s |
| 680 sub v2.4s, v10.4s, v12.4s |
| 681 add v12.4s, v4.4s, v14.4s |
| 682 sub v4.4s, v4.4s, v14.4s |
| 683 add v10.4s, v2.4s, v8.4s |
| 684 sub v6.4s, v2.4s, v8.4s |
| 685 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 686 shrn ROW3L.4h, v10.4s, #16 |
| 687 shrn ROW0L.4h, v12.4s, #16 |
| 688 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 689 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
| 690 ld1 {v2.4h}, [x15] /* reload constants */ |
| 691 smull v12.4s, ROW5L.4h, XFIX_1_175875602 |
| 692 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 693 smull v14.4s, ROW7L.4h, XFIX_1_175875602 |
| 694 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 695 smull v4.4s, ROW6L.4h, XFIX_0_541196100 |
| 696 sshll v6.4s, ROW4L.4h, #13 |
| 697 mov v8.16b, v12.16b |
| 698 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 699 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 |
| 700 add v2.4s, v6.4s, v4.4s |
| 701 mov v10.16b, v14.16b |
| 702 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 703 add v2.4s, v2.4s, v12.4s |
| 704 add v12.4s, v12.4s, v12.4s |
| 705 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 |
| 706 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 707 sub v2.4s, v2.4s, v12.4s |
| 708 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 709 sub v6.4s, v6.4s, v4.4s |
| 710 shrn ROW6R.4h, v2.4s, #16 |
| 711 add v2.4s, v6.4s, v10.4s |
| 712 sub v6.4s, v6.4s, v10.4s |
| 713 sshll v10.4s, ROW4L.4h, #13 |
| 714 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 715 shrn ROW5R.4h, v6.4s, #16 |
| 716 add v4.4s, v10.4s, v12.4s |
| 717 sub v2.4s, v10.4s, v12.4s |
| 718 add v12.4s, v4.4s, v14.4s |
| 719 sub v4.4s, v4.4s, v14.4s |
| 720 add v10.4s, v2.4s, v8.4s |
| 721 sub v6.4s, v2.4s, v8.4s |
| 722 shrn ROW7R.4h, v4.4s, #16 |
| 723 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 724 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 725 shrn ROW4R.4h, v6.4s, #16 |
| 726 b 2b /* Go to epilogue */ |
| 727 |
| 728 .unreq DCT_TABLE |
| 729 .unreq COEF_BLOCK |
| 730 .unreq OUTPUT_BUF |
| 731 .unreq OUTPUT_COL |
| 732 .unreq TMP1 |
| 733 .unreq TMP2 |
| 734 .unreq TMP3 |
| 735 .unreq TMP4 |
| 736 |
| 737 .unreq ROW0L |
| 738 .unreq ROW0R |
| 739 .unreq ROW1L |
| 740 .unreq ROW1R |
| 741 .unreq ROW2L |
| 742 .unreq ROW2R |
| 743 .unreq ROW3L |
| 744 .unreq ROW3R |
| 745 .unreq ROW4L |
| 746 .unreq ROW4R |
| 747 .unreq ROW5L |
| 748 .unreq ROW5R |
| 749 .unreq ROW6L |
| 750 .unreq ROW6R |
| 751 .unreq ROW7L |
| 752 .unreq ROW7R |
| 753 |
| 754 |
| 755 /*****************************************************************************/ |
| 756 |
| 757 /* |
| 758 * jsimd_idct_ifast_neon |
| 759 * |
| 760 * This function contains a fast, not so accurate integer implementation of |
| 761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
| 762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
| 763 * function from jidctfst.c |
| 764 * |
| 765 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions. |
| 766 * In the ARM NEON case, however, some extra additions are required because the |
| 767 * VQDMULH instruction can't handle constants larger than 1. So expressions |
| 768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
| 769 * which introduces an extra addition. Overall, there are 6 extra additions |
| 770 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. |
| 771 */ |
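To make the conversion described above concrete: SQDMULH can only scale by a Q15 fraction with magnitude below 1, computing roughly (x * c) >> 15, so the XFIX_* constants that follow hold the fractional part of each multiplier (the Q8 value from jidctfst.c minus 1.0 or 2.0, rescaled to Q15), and the integer part is restored with ordinary additions. A minimal C sketch of the trick (illustration only, not part of the assembly source; sqdmulh16 and the mul_* helpers are made-up names, and saturation is omitted):

    #include <stdint.h>

    /* One 16-bit lane of SQDMULH: (2*a*b) >> 16, i.e. (a*b) >> 15 (saturation omitted) */
    static int16_t sqdmulh16(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * b) >> 15);
    }

    static int16_t mul_1_082392200(int16_t x)   /* x * 1.082392200 ~= x + x * 0.082392200 */
    {
      const int16_t XFIX_1_082392200 = 277 * 128 - 256 * 128;  /* fractional part in Q15 */
      return (int16_t)(x + sqdmulh16(x, XFIX_1_082392200));
    }

    static int16_t mul_2_613125930(int16_t x)   /* x * 2.613125930 ~= 2*x + x * 0.613125930 */
    {
      const int16_t XFIX_2_613125930 = 669 * 128 - 512 * 128;  /* fractional part in Q15 */
      return (int16_t)(x + x + sqdmulh16(x, XFIX_2_613125930));
    }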
| 772 |
| 773 #define XFIX_1_082392200 v0.4h[0] |
| 774 #define XFIX_1_414213562 v0.4h[1] |
| 775 #define XFIX_1_847759065 v0.4h[2] |
| 776 #define XFIX_2_613125930 v0.4h[3] |
| 777 |
| 778 .balign 16 |
| 779 jsimd_idct_ifast_neon_consts: |
| 780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
| 781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
| 782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
| 783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
| 784 |
| 785 asm_function jsimd_idct_ifast_neon |
| 786 |
| 787 DCT_TABLE .req x0 |
| 788 COEF_BLOCK .req x1 |
| 789 OUTPUT_BUF .req x2 |
| 790 OUTPUT_COL .req x3 |
| 791 TMP1 .req x0 |
| 792 TMP2 .req x1 |
| 793 TMP3 .req x2 |
| 794 TMP4 .req x22 |
| 795 TMP5 .req x23 |
| 796 |
| 797 /* Load and dequantize coefficients into NEON registers |
| 798 * with the following allocation: |
| 799 * 0 1 2 3 | 4 5 6 7 |
| 800 * ---------+-------- |
| 801 * 0 | d16 | d17 ( v8.8h ) |
| 802 * 1 | d18 | d19 ( v9.8h ) |
| 803 * 2 | d20 | d21 ( v10.8h ) |
| 804 * 3 | d22 | d23 ( v11.8h ) |
| 805 * 4 | d24 | d25 ( v12.8h ) |
| 806 * 5 | d26 | d27 ( v13.8h ) |
| 807 * 6 | d28 | d29 ( v14.8h ) |
| 808 * 7 | d30 | d31 ( v15.8h ) |
| 809 */ |
| 810 /* Save NEON registers used in fast IDCT */ |
| 811 sub sp, sp, #176 |
| 812 stp x22, x23, [sp], 16 |
| 813 adr x23, jsimd_idct_ifast_neon_consts |
| 814 st1 {v0.8b - v3.8b}, [sp], 32 |
| 815 st1 {v4.8b - v7.8b}, [sp], 32 |
| 816 st1 {v8.8b - v11.8b}, [sp], 32 |
| 817 st1 {v12.8b - v15.8b}, [sp], 32 |
| 818 st1 {v16.8b - v19.8b}, [sp], 32 |
| 819 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 |
| 820 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
| 821 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 |
| 822 mul v8.8h, v8.8h, v0.8h |
| 823 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
| 824 mul v9.8h, v9.8h, v1.8h |
| 825 ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 |
| 826 mul v10.8h, v10.8h, v2.8h |
| 827 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
| 828 mul v11.8h, v11.8h, v3.8h |
| 829 ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 |
| 830 mul v12.8h, v12.8h, v0.8h |
| 831 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
| 832 mul v14.8h, v14.8h, v2.8h |
| 833 mul v13.8h, v13.8h, v1.8h |
| 834 ld1 {v0.4h}, [x23] /* load constants */ |
| 835 mul v15.8h, v15.8h, v3.8h |
| 836 |
| 837 /* 1-D IDCT, pass 1 */ |
| 838 sub v2.8h, v10.8h, v14.8h |
| 839 add v14.8h, v10.8h, v14.8h |
| 840 sub v1.8h, v11.8h, v13.8h |
| 841 add v13.8h, v11.8h, v13.8h |
| 842 sub v5.8h, v9.8h, v15.8h |
| 843 add v15.8h, v9.8h, v15.8h |
| 844 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
| 845 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
| 846 add v3.8h, v1.8h, v1.8h |
| 847 sub v1.8h, v5.8h, v1.8h |
| 848 add v10.8h, v2.8h, v4.8h |
| 849 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
| 850 sub v2.8h, v15.8h, v13.8h |
| 851 add v3.8h, v3.8h, v6.8h |
| 852 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
| 853 add v1.8h, v1.8h, v4.8h |
| 854 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
| 855 sub v10.8h, v10.8h, v14.8h |
| 856 add v2.8h, v2.8h, v6.8h |
| 857 sub v6.8h, v8.8h, v12.8h |
| 858 add v12.8h, v8.8h, v12.8h |
| 859 add v9.8h, v5.8h, v4.8h |
| 860 add v5.8h, v6.8h, v10.8h |
| 861 sub v10.8h, v6.8h, v10.8h |
| 862 add v6.8h, v15.8h, v13.8h |
| 863 add v8.8h, v12.8h, v14.8h |
| 864 sub v3.8h, v6.8h, v3.8h |
| 865 sub v12.8h, v12.8h, v14.8h |
| 866 sub v3.8h, v3.8h, v1.8h |
| 867 sub v1.8h, v9.8h, v1.8h |
| 868 add v2.8h, v3.8h, v2.8h |
| 869 sub v15.8h, v8.8h, v6.8h |
| 870 add v1.8h, v1.8h, v2.8h |
| 871 add v8.8h, v8.8h, v6.8h |
| 872 add v14.8h, v5.8h, v3.8h |
| 873 sub v9.8h, v5.8h, v3.8h |
| 874 sub v13.8h, v10.8h, v2.8h |
| 875 add v10.8h, v10.8h, v2.8h |
| 876 /* Transpose q8-q9 */ |
| 877 mov v18.16b, v8.16b |
| 878 trn1 v8.8h, v8.8h, v9.8h |
| 879 trn2 v9.8h, v18.8h, v9.8h |
| 880 sub v11.8h, v12.8h, v1.8h |
| 881 /* Transpose q14-q15 */ |
| 882 mov v18.16b, v14.16b |
| 883 trn1 v14.8h, v14.8h, v15.8h |
| 884 trn2 v15.8h, v18.8h, v15.8h |
| 885 add v12.8h, v12.8h, v1.8h |
| 886 /* Transpose q10-q11 */ |
| 887 mov v18.16b, v10.16b |
| 888 trn1 v10.8h, v10.8h, v11.8h |
| 889 trn2 v11.8h, v18.8h, v11.8h |
| 890 /* Transpose q12-q13 */ |
| 891 mov v18.16b, v12.16b |
| 892 trn1 v12.8h, v12.8h, v13.8h |
| 893 trn2 v13.8h, v18.8h, v13.8h |
| 894 /* Transpose q9-q11 */ |
| 895 mov v18.16b, v9.16b |
| 896 trn1 v9.4s, v9.4s, v11.4s |
| 897 trn2 v11.4s, v18.4s, v11.4s |
| 898 /* Transpose q12-q14 */ |
| 899 mov v18.16b, v12.16b |
| 900 trn1 v12.4s, v12.4s, v14.4s |
| 901 trn2 v14.4s, v18.4s, v14.4s |
| 902 /* Transpose q8-q10 */ |
| 903 mov v18.16b, v8.16b |
| 904 trn1 v8.4s, v8.4s, v10.4s |
| 905 trn2 v10.4s, v18.4s, v10.4s |
| 906 /* Transpose q13-q15 */ |
| 907 mov v18.16b, v13.16b |
| 908 trn1 v13.4s, v13.4s, v15.4s |
| 909 trn2 v15.4s, v18.4s, v15.4s |
| 910 /* vswp v14.4h, v10-MSB.4h */ |
| 911 umov x22, v14.d[0] |
| 912 ins v14.2d[0], v10.2d[1] |
| 913 ins v10.2d[1], x22 |
| 914 /* vswp v13.4h, v9MSB.4h */ |
| 915 |
| 916 umov x22, v13.d[0] |
| 917 ins v13.2d[0], v9.2d[1] |
| 918 ins v9.2d[1], x22 |
| 919 /* 1-D IDCT, pass 2 */ |
| 920 sub v2.8h, v10.8h, v14.8h |
| 921 /* vswp v15.4h, v11MSB.4h */ |
| 922 umov x22, v15.d[0] |
| 923 ins v15.2d[0], v11.2d[1] |
| 924 ins v11.2d[1], x22 |
| 925 add v14.8h, v10.8h, v14.8h |
| 926 /* vswp v12.4h, v8-MSB.4h */ |
| 927 umov x22, v12.d[0] |
| 928 ins v12.2d[0], v8.2d[1] |
| 929 ins v8.2d[1], x22 |
| 930 sub v1.8h, v11.8h, v13.8h |
| 931 add v13.8h, v11.8h, v13.8h |
| 932 sub v5.8h, v9.8h, v15.8h |
| 933 add v15.8h, v9.8h, v15.8h |
| 934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
| 935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
| 936 add v3.8h, v1.8h, v1.8h |
| 937 sub v1.8h, v5.8h, v1.8h |
| 938 add v10.8h, v2.8h, v4.8h |
| 939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
| 940 sub v2.8h, v15.8h, v13.8h |
| 941 add v3.8h, v3.8h, v6.8h |
| 942 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
| 943 add v1.8h, v1.8h, v4.8h |
| 944 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
| 945 sub v10.8h, v10.8h, v14.8h |
| 946 add v2.8h, v2.8h, v6.8h |
| 947 sub v6.8h, v8.8h, v12.8h |
| 948 add v12.8h, v8.8h, v12.8h |
| 949 add v9.8h, v5.8h, v4.8h |
| 950 add v5.8h, v6.8h, v10.8h |
| 951 sub v10.8h, v6.8h, v10.8h |
| 952 add v6.8h, v15.8h, v13.8h |
| 953 add v8.8h, v12.8h, v14.8h |
| 954 sub v3.8h, v6.8h, v3.8h |
| 955 sub v12.8h, v12.8h, v14.8h |
| 956 sub v3.8h, v3.8h, v1.8h |
| 957 sub v1.8h, v9.8h, v1.8h |
| 958 add v2.8h, v3.8h, v2.8h |
| 959 sub v15.8h, v8.8h, v6.8h |
| 960 add v1.8h, v1.8h, v2.8h |
| 961 add v8.8h, v8.8h, v6.8h |
| 962 add v14.8h, v5.8h, v3.8h |
| 963 sub v9.8h, v5.8h, v3.8h |
| 964 sub v13.8h, v10.8h, v2.8h |
| 965 add v10.8h, v10.8h, v2.8h |
| 966 sub v11.8h, v12.8h, v1.8h |
| 967 add v12.8h, v12.8h, v1.8h |
| 968 /* Descale to 8-bit and range limit */ |
| 969 movi v0.16b, #0x80 |
| 970 sqshrn v8.8b, v8.8h, #5 |
| 971 sqshrn2 v8.16b, v9.8h, #5 |
| 972 sqshrn v9.8b, v10.8h, #5 |
| 973 sqshrn2 v9.16b, v11.8h, #5 |
| 974 sqshrn v10.8b, v12.8h, #5 |
| 975 sqshrn2 v10.16b, v13.8h, #5 |
| 976 sqshrn v11.8b, v14.8h, #5 |
| 977 sqshrn2 v11.16b, v15.8h, #5 |
| 978 add v8.16b, v8.16b, v0.16b |
| 979 add v9.16b, v9.16b, v0.16b |
| 980 add v10.16b, v10.16b, v0.16b |
| 981 add v11.16b, v11.16b, v0.16b |
| 982 /* Transpose the final 8-bit samples */ |
| 983 /* Transpose q8-q9 */ |
| 984 mov v18.16b, v8.16b |
| 985 trn1 v8.8h, v8.8h, v9.8h |
| 986 trn2 v9.8h, v18.8h, v9.8h |
| 987 /* Transpose q10-q11 */ |
| 988 mov v18.16b, v10.16b |
| 989 trn1 v10.8h, v10.8h, v11.8h |
| 990 trn2 v11.8h, v18.8h, v11.8h |
| 991 /* Transpose q8-q10 */ |
| 992 mov v18.16b, v8.16b |
| 993 trn1 v8.4s, v8.4s, v10.4s |
| 994 trn2 v10.4s, v18.4s, v10.4s |
| 995 /* Transpose q9-q11 */ |
| 996 mov v18.16b, v9.16b |
| 997 trn1 v9.4s, v9.4s, v11.4s |
| 998 trn2 v11.4s, v18.4s, v11.4s |
| 999 /* make copy */ |
| 1000 ins v17.2d[0], v8.2d[1] |
| 1001 /* Transpose d16-d17-msb */ |
| 1002 mov v18.16b, v8.16b |
| 1003 trn1 v8.8b, v8.8b, v17.8b |
| 1004 trn2 v17.8b, v18.8b, v17.8b |
| 1005 /* make copy */ |
| 1006 ins v19.2d[0], v9.2d[1] |
| 1007 mov v18.16b, v9.16b |
| 1008 trn1 v9.8b, v9.8b, v19.8b |
| 1009 trn2 v19.8b, v18.8b, v19.8b |
| 1010 /* Store results to the output buffer */ |
| 1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1012 add TMP1, TMP1, OUTPUT_COL |
| 1013 add TMP2, TMP2, OUTPUT_COL |
| 1014 st1 {v8.8b}, [TMP1] |
| 1015 st1 {v17.8b}, [TMP2] |
| 1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1017 add TMP1, TMP1, OUTPUT_COL |
| 1018 add TMP2, TMP2, OUTPUT_COL |
| 1019 st1 {v9.8b}, [TMP1] |
| 1020 /* make copy */ |
| 1021 ins v7.2d[0], v10.2d[1] |
| 1022 mov v18.16b, v10.16b |
| 1023 trn1 v10.8b, v10.8b, v7.8b |
| 1024 trn2 v7.8b, v18.8b, v7.8b |
| 1025 st1 {v19.8b}, [TMP2] |
| 1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 |
| 1028 add TMP1, TMP1, OUTPUT_COL |
| 1029 add TMP2, TMP2, OUTPUT_COL |
| 1030 add TMP4, TMP4, OUTPUT_COL |
| 1031 add TMP5, TMP5, OUTPUT_COL |
| 1032 st1 {v10.8b}, [TMP1] |
| 1033 /* make copy */ |
| 1034 ins v16.2d[0], v11.2d[1] |
| 1035 mov v18.16b, v11.16b |
| 1036 trn1 v11.8b, v11.8b, v16.8b |
| 1037 trn2 v16.8b, v18.8b, v16.8b |
| 1038 st1 {v7.8b}, [TMP2] |
| 1039 st1 {v11.8b}, [TMP4] |
| 1040 st1 {v16.8b}, [TMP5] |
| 1041 sub sp, sp, #176 |
| 1042 ldp x22, x23, [sp], 16 |
| 1043 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 1044 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 1045 ld1 {v8.8b - v11.8b}, [sp], 32 |
| 1046 ld1 {v12.8b - v15.8b}, [sp], 32 |
| 1047 ld1 {v16.8b - v19.8b}, [sp], 32 |
| 1048 blr x30 |
| 1049 |
| 1050 .unreq DCT_TABLE |
| 1051 .unreq COEF_BLOCK |
| 1052 .unreq OUTPUT_BUF |
| 1053 .unreq OUTPUT_COL |
| 1054 .unreq TMP1 |
| 1055 .unreq TMP2 |
| 1056 .unreq TMP3 |
| 1057 .unreq TMP4 |
| 1058 |
| 1059 |
| 1060 /*****************************************************************************/ |
| 1061 |
| 1062 /* |
| 1063 * jsimd_idct_4x4_neon |
| 1064 * |
| 1065 * This function contains inverse-DCT code for getting a reduced-size |
| 1066 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations |
| 1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
| 1068 * function from jpeg-6b (jidctred.c). |
| 1069 * |
| 1070 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which |
| 1071 * requires far fewer arithmetic operations and hence should be faster. |
| 1072 * The primary purpose of this particular NEON-optimized function is |
| 1073 * bit-exact compatibility with jpeg-6b. |
| 1074 * |
| 1075 * TODO: slightly better instruction scheduling could be achieved by expanding |
| 1076 * the idct_helper/transpose_4x4 macros and reordering instructions, |
| 1077 * but readability would suffer somewhat. |
| 1078 */ |
| 1079 |
| 1080 #define CONST_BITS 13 |
| 1081 |
| 1082 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
| 1083 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
| 1084 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
| 1085 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
| 1086 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
| 1087 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
| 1088 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
| 1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
| 1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
| 1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
| 1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
| 1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
| 1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
| 1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
| 1096 |
| 1097 .balign 16 |
| 1098 jsimd_idct_4x4_neon_consts: |
| 1099 .short FIX_1_847759065 /* v0.4h[0] */ |
| 1100 .short -FIX_0_765366865 /* v0.4h[1] */ |
| 1101 .short -FIX_0_211164243 /* v0.4h[2] */ |
| 1102 .short FIX_1_451774981 /* v0.4h[3] */ |
| 1103 .short -FIX_2_172734803 /* d1[0] */ |
| 1104 .short FIX_1_061594337 /* d1[1] */ |
| 1105 .short -FIX_0_509795579 /* d1[2] */ |
| 1106 .short -FIX_0_601344887 /* d1[3] */ |
| 1107 .short FIX_0_899976223 /* v2.4h[0] */ |
| 1108 .short FIX_2_562915447 /* v2.4h[1] */ |
| 1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ |
| 1110 .short 0 /* v2.4h[3] */ |
| 1111 |
| 1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
| 1113 smull v28.4s, \x4, v2.4h[2] |
| 1114 smlal v28.4s, \x8, v0.4h[0] |
| 1115 smlal v28.4s, \x14, v0.4h[1] |
| 1116 |
| 1117 smull v26.4s, \x16, v1.4h[2] |
| 1118 smlal v26.4s, \x12, v1.4h[3] |
| 1119 smlal v26.4s, \x10, v2.4h[0] |
| 1120 smlal v26.4s, \x6, v2.4h[1] |
| 1121 |
| 1122 smull v30.4s, \x4, v2.4h[2] |
| 1123 smlsl v30.4s, \x8, v0.4h[0] |
| 1124 smlsl v30.4s, \x14, v0.4h[1] |
| 1125 |
| 1126 smull v24.4s, \x16, v0.4h[2] |
| 1127 smlal v24.4s, \x12, v0.4h[3] |
| 1128 smlal v24.4s, \x10, v1.4h[0] |
| 1129 smlal v24.4s, \x6, v1.4h[1] |
| 1130 |
| 1131 add v20.4s, v28.4s, v26.4s |
| 1132 sub v28.4s, v28.4s, v26.4s |
| 1133 |
| 1134 .if \shift > 16 |
| 1135 srshr v20.4s, v20.4s, #\shift |
| 1136 srshr v28.4s, v28.4s, #\shift |
| 1137 xtn \y26, v20.4s |
| 1138 xtn \y29, v28.4s |
| 1139 .else |
| 1140 rshrn \y26, v20.4s, #\shift |
| 1141 rshrn \y29, v28.4s, #\shift |
| 1142 .endif |
| 1143 |
| 1144 add v20.4s, v30.4s, v24.4s |
| 1145 sub v30.4s, v30.4s, v24.4s |
| 1146 |
| 1147 .if \shift > 16 |
| 1148 srshr v20.4s, v20.4s, #\shift |
| 1149 srshr v30.4s, v30.4s, #\shift |
| 1150 xtn \y27, v20.4s |
| 1151 xtn \y28, v30.4s |
| 1152 .else |
| 1153 rshrn \y27, v20.4s, #\shift |
| 1154 rshrn \y28, v30.4s, #\shift |
| 1155 .endif |
| 1156 |
| 1157 .endm |
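A note on the .if \shift > 16 branches above: RSHRN narrows 32-bit lanes to 16 bits with a rounding right shift whose immediate is limited to 16, so the pass-2 descale by 19 bits has to be split into a 32-bit rounding shift (SRSHR) followed by a plain narrow (XTN). Both branches compute the same rounding descale, sketched here in C (illustration only, not part of the assembly source; descale is a made-up name):

    #include <stdint.h>

    static int16_t descale(int32_t x, int shift)
    {
      /* round to nearest by adding half of the divisor, shift, then narrow to 16 bits */
      return (int16_t)((x + (1 << (shift - 1))) >> shift);
    }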
| 1158 |
| 1159 asm_function jsimd_idct_4x4_neon |
| 1160 |
| 1161 DCT_TABLE .req x0 |
| 1162 COEF_BLOCK .req x1 |
| 1163 OUTPUT_BUF .req x2 |
| 1164 OUTPUT_COL .req x3 |
| 1165 TMP1 .req x0 |
| 1166 TMP2 .req x1 |
| 1167 TMP3 .req x2 |
| 1168 TMP4 .req x15 |
| 1169 |
| 1170 /* Save all used NEON registers */ |
| 1171 sub sp, sp, 272 |
| 1172 str x15, [sp], 16 |
| 1173 /* Load constants (v3.4h is just used for padding) */ |
| 1174 adr TMP4, jsimd_idct_4x4_neon_consts |
| 1175 st1 {v0.8b - v3.8b}, [sp], 32 |
| 1176 st1 {v4.8b - v7.8b}, [sp], 32 |
| 1177 st1 {v8.8b - v11.8b}, [sp], 32 |
| 1178 st1 {v12.8b - v15.8b}, [sp], 32 |
| 1179 st1 {v16.8b - v19.8b}, [sp], 32 |
| 1180 st1 {v20.8b - v23.8b}, [sp], 32 |
| 1181 st1 {v24.8b - v27.8b}, [sp], 32 |
| 1182 st1 {v28.8b - v31.8b}, [sp], 32 |
| 1183 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] |
| 1184 |
| 1185 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 1186 * 0 1 2 3 | 4 5 6 7 |
| 1187 * ---------+-------- |
| 1188 * 0 | v4.4h | v5.4h |
| 1189 * 1 | v6.4h | v7.4h |
| 1190 * 2 | v8.4h | v9.4h |
| 1191 * 3 | v10.4h | v11.4h |
| 1192 * 4 | - | - |
| 1193 * 5 | v12.4h | v13.4h |
| 1194 * 6 | v14.4h | v15.4h |
| 1195 * 7 | v16.4h | v17.4h |
| 1196 */ |
| 1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
| 1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
| 1199 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
| 1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 1202 /* dequantize */ |
| 1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 1204 mul v4.4h, v4.4h, v18.4h |
| 1205 mul v5.4h, v5.4h, v19.4h |
| 1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ |
| 1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
| 1208 mul v6.4h, v6.4h, v20.4h |
| 1209 mul v7.4h, v7.4h, v21.4h |
| 1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ |
| 1211 mul v8.4h, v8.4h, v22.4h |
| 1212 mul v9.4h, v9.4h, v23.4h |
| 1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ |
| 1214 add DCT_TABLE, DCT_TABLE, #16 |
| 1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
| 1216 mul v10.4h, v10.4h, v24.4h |
| 1217 mul v11.4h, v11.4h, v25.4h |
| 1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ |
| 1219 mul v12.4h, v12.4h, v26.4h |
| 1220 mul v13.4h, v13.4h, v27.4h |
| 1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ |
| 1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 1223 mul v14.4h, v14.4h, v28.4h |
| 1224 mul v15.4h, v15.4h, v29.4h |
| 1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ |
| 1226 mul v16.4h, v16.4h, v30.4h |
| 1227 mul v17.4h, v17.4h, v31.4h |
| 1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ |
| 1229 |
| 1230 /* Pass 1 */ |
| 1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h |
| 1232 transpose_4x4 v4, v6, v8, v10, v3 |
| 1233 ins v10.2d[1], v11.2d[0] |
| 1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h |
| 1235 transpose_4x4 v5, v7, v9, v11, v3 |
| 1236 ins v10.2d[1], v11.2d[0] |
| 1237 /* Pass 2 */ |
| 1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h |
| 1239 transpose_4x4 v26, v27, v28, v29, v3 |
| 1240 |
| 1241 /* Range limit */ |
| 1242 movi v30.8h, #0x80 |
| 1243 ins v26.2d[1], v27.2d[0] |
| 1244 ins v28.2d[1], v29.2d[0] |
| 1245 add v26.8h, v26.8h, v30.8h |
| 1246 add v28.8h, v28.8h, v30.8h |
| 1247 sqxtun v26.8b, v26.8h |
| 1248 sqxtun v27.8b, v28.8h |
| 1249 |
| 1250 /* Store results to the output buffer */ |
| 1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1252 ldp TMP3, TMP4, [OUTPUT_BUF] |
| 1253 add TMP1, TMP1, OUTPUT_COL |
| 1254 add TMP2, TMP2, OUTPUT_COL |
| 1255 add TMP3, TMP3, OUTPUT_COL |
| 1256 add TMP4, TMP4, OUTPUT_COL |
| 1257 |
| 1258 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT |
| 1259 /* We can use far fewer instructions on little-endian systems if the |
| 1260 * OS kernel is not configured to trap unaligned memory accesses |
| 1261 */ |
| 1262 st1 {v26.s}[0], [TMP1], 4 |
| 1263 st1 {v27.s}[0], [TMP3], 4 |
| 1264 st1 {v26.s}[1], [TMP2], 4 |
| 1265 st1 {v27.s}[1], [TMP4], 4 |
| 1266 #else |
| 1267 st1 {v26.b}[0], [TMP1], 1 |
| 1268 st1 {v27.b}[0], [TMP3], 1 |
| 1269 st1 {v26.b}[1], [TMP1], 1 |
| 1270 st1 {v27.b}[1], [TMP3], 1 |
| 1271 st1 {v26.b}[2], [TMP1], 1 |
| 1272 st1 {v27.b}[2], [TMP3], 1 |
| 1273 st1 {v26.b}[3], [TMP1], 1 |
| 1274 st1 {v27.b}[3], [TMP3], 1 |
| 1275 |
| 1276 st1 {v26.b}[4], [TMP2], 1 |
| 1277 st1 {v27.b}[4], [TMP4], 1 |
| 1278 st1 {v26.b}[5], [TMP2], 1 |
| 1279 st1 {v27.b}[5], [TMP4], 1 |
| 1280 st1 {v26.b}[6], [TMP2], 1 |
| 1281 st1 {v27.b}[6], [TMP4], 1 |
| 1282 st1 {v26.b}[7], [TMP2], 1 |
| 1283 st1 {v27.b}[7], [TMP4], 1 |
| 1284 #endif |
| 1285 |
| 1286 /* vpop {v8.4h - v15.4h} ;not available */ |
| 1287 sub sp, sp, #272 |
| 1288 ldr x15, [sp], 16 |
| 1289 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 1290 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 1291 ld1 {v8.8b - v11.8b}, [sp], 32 |
| 1292 ld1 {v12.8b - v15.8b}, [sp], 32 |
| 1293 ld1 {v16.8b - v19.8b}, [sp], 32 |
| 1294 ld1 {v20.8b - v23.8b}, [sp], 32 |
| 1295 ld1 {v24.8b - v27.8b}, [sp], 32 |
| 1296 ld1 {v28.8b - v31.8b}, [sp], 32 |
| 1297 blr x30 |
| 1298 |
| 1299 .unreq DCT_TABLE |
| 1300 .unreq COEF_BLOCK |
| 1301 .unreq OUTPUT_BUF |
| 1302 .unreq OUTPUT_COL |
| 1303 .unreq TMP1 |
| 1304 .unreq TMP2 |
| 1305 .unreq TMP3 |
| 1306 .unreq TMP4 |
| 1307 |
| 1308 .purgem idct_helper |
| 1309 |
| 1310 |
| 1311 /*****************************************************************************/ |
| 1312 |
| 1313 /* |
| 1314 * jsimd_idct_2x2_neon |
| 1315 * |
| 1316 * This function contains inverse-DCT code for getting a reduced-size |
| 1317 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations |
| 1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
| 1319 * function from jpeg-6b (jidctred.c). |
| 1320 * |
| 1321 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which |
| 1322 * requires far fewer arithmetic operations and hence should be faster. |
| 1323 * The primary purpose of this particular NEON-optimized function is |
| 1324 * bit-exact compatibility with jpeg-6b. |
| 1325 */ |
| 1326 |
| 1327 .balign 8 |
| 1328 jsimd_idct_2x2_neon_consts: |
| 1329 .short -FIX_0_720959822 /* v14[0] */ |
| 1330 .short FIX_0_850430095 /* v14[1] */ |
| 1331 .short -FIX_1_272758580 /* v14[2] */ |
| 1332 .short FIX_3_624509785 /* v14[3] */ |
| 1333 |
| 1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
| 1335 sshll v15.4s, \x4, #15 |
| 1336 smull v26.4s, \x6, v14.4h[3] |
| 1337 smlal v26.4s, \x10, v14.4h[2] |
| 1338 smlal v26.4s, \x12, v14.4h[1] |
| 1339 smlal v26.4s, \x16, v14.4h[0] |
| 1340 |
| 1341 add v20.4s, v15.4s, v26.4s |
| 1342 sub v15.4s, v15.4s, v26.4s |
| 1343 |
| 1344 .if \shift > 16 |
| 1345 srshr v20.4s, v20.4s, #\shift |
| 1346 srshr v15.4s, v15.4s, #\shift |
| 1347 xtn \y26, v20.4s |
| 1348 xtn \y27, v15.4s |
| 1349 .else |
| 1350 rshrn \y26, v20.4s, #\shift |
| 1351 rshrn \y27, v15.4s, #\shift |
| 1352 .endif |
| 1353 |
| 1354 .endm |
| 1355 |
| 1356 asm_function jsimd_idct_2x2_neon |
| 1357 |
| 1358 DCT_TABLE .req x0 |
| 1359 COEF_BLOCK .req x1 |
| 1360 OUTPUT_BUF .req x2 |
| 1361 OUTPUT_COL .req x3 |
| 1362 TMP1 .req x0 |
| 1363 TMP2 .req x15 |
| 1364 |
| 1365 /* vpush {v8.4h - v15.4h} ; not available */ |
| 1366 sub sp, sp, 208 |
| 1367 str x15, [sp], 16 |
| 1368 |
| 1369 /* Load constants */ |
| 1370 adr TMP2, jsimd_idct_2x2_neon_consts |
| 1371 st1 {v4.8b - v7.8b}, [sp], 32 |
| 1372 st1 {v8.8b - v11.8b}, [sp], 32 |
| 1373 st1 {v12.8b - v15.8b}, [sp], 32 |
| 1374 st1 {v16.8b - v19.8b}, [sp], 32 |
| 1375 st1 {v21.8b - v22.8b}, [sp], 16 |
| 1376 st1 {v24.8b - v27.8b}, [sp], 32 |
| 1377 st1 {v30.8b - v31.8b}, [sp], 16 |
| 1378 ld1 {v14.4h}, [TMP2] |
| 1379 |
| 1380 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 1381 * 0 1 2 3 | 4 5 6 7 |
| 1382 * ---------+-------- |
| 1383 * 0 | v4.4h | v5.4h |
| 1384 * 1 | v6.4h | v7.4h |
| 1385 * 2 | - | - |
| 1386 * 3 | v10.4h | v11.4h |
| 1387 * 4 | - | - |
| 1388 * 5 | v12.4h | v13.4h |
| 1389 * 6 | - | - |
| 1390 * 7 | v16.4h | v17.4h |
| 1391 */ |
| 1392 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
| 1393 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
| 1395 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
| 1397 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 1399 /* Dequantize */ |
| 1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 1401 mul v4.4h, v4.4h, v18.4h |
| 1402 mul v5.4h, v5.4h, v19.4h |
| 1403 ins v4.2d[1], v5.2d[0] |
| 1404 mul v6.4h, v6.4h, v20.4h |
| 1405 mul v7.4h, v7.4h, v21.4h |
| 1406 ins v6.2d[1], v7.2d[0] |
| 1407 add DCT_TABLE, DCT_TABLE, #16 |
| 1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
| 1409 mul v10.4h, v10.4h, v24.4h |
| 1410 mul v11.4h, v11.4h, v25.4h |
| 1411 ins v10.2d[1], v11.2d[0] |
| 1412 add DCT_TABLE, DCT_TABLE, #16 |
| 1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
| 1414 mul v12.4h, v12.4h, v26.4h |
| 1415 mul v13.4h, v13.4h, v27.4h |
| 1416 ins v12.2d[1], v13.2d[0] |
| 1417 add DCT_TABLE, DCT_TABLE, #16 |
| 1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 1419 mul v16.4h, v16.4h, v30.4h |
| 1420 mul v17.4h, v17.4h, v31.4h |
| 1421 ins v16.2d[1], v17.2d[0] |
| 1422 |
| 1423 /* Pass 1 */ |
| 1424 #if 0 |
| 1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
| 1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
| 1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
| 1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
| 1429 #else |
| 1430 smull v26.4s, v6.4h, v14.4h[3] |
| 1431 smlal v26.4s, v10.4h, v14.4h[2] |
| 1432 smlal v26.4s, v12.4h, v14.4h[1] |
| 1433 smlal v26.4s, v16.4h, v14.4h[0] |
| 1434 smull v24.4s, v7.4h, v14.4h[3] |
| 1435 smlal v24.4s, v11.4h, v14.4h[2] |
| 1436 smlal v24.4s, v13.4h, v14.4h[1] |
| 1437 smlal v24.4s, v17.4h, v14.4h[0] |
| 1438 sshll v15.4s, v4.4h, #15 |
| 1439 sshll v30.4s, v5.4h, #15 |
| 1440 add v20.4s, v15.4s, v26.4s |
| 1441 sub v15.4s, v15.4s, v26.4s |
| 1442 rshrn v4.4h, v20.4s, #13 |
| 1443 rshrn v6.4h, v15.4s, #13 |
| 1444 add v20.4s, v30.4s, v24.4s |
| 1445 sub v15.4s, v30.4s, v24.4s |
| 1446 rshrn v5.4h, v20.4s, #13 |
| 1447 rshrn v7.4h, v15.4s, #13 |
| 1448 ins v4.2d[1], v5.2d[0] |
| 1449 ins v6.2d[1], v7.2d[0] |
| 1450 transpose v4, v6, v3, .16b, .8h |
| 1451 transpose v6, v10, v3, .16b, .4s |
| 1452 ins v11.2d[0], v10.2d[1] |
| 1453 ins v7.2d[0], v6.2d[1] |
| 1454 #endif |
| 1455 |
| 1456 /* Pass 2 */ |
| 1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
| 1458 |
| 1459 /* Range limit */ |
| 1460 movi v30.8h, #0x80 |
| 1461 ins v26.2d[1], v27.2d[0] |
| 1462 add v26.8h, v26.8h, v30.8h |
| 1463 sqxtun v30.8b, v26.8h |
| 1464 ins v26.2d[0], v30.2d[0] |
| 1465 sqxtun v27.8b, v26.8h |
| 1466 |
| 1467 /* Store results to the output buffer */ |
| 1468 ldp TMP1, TMP2, [OUTPUT_BUF] |
| 1469 add TMP1, TMP1, OUTPUT_COL |
| 1470 add TMP2, TMP2, OUTPUT_COL |
| 1471 |
| 1472 st1 {v26.b}[0], [TMP1], 1 |
| 1473 st1 {v27.b}[4], [TMP1], 1 |
| 1474 st1 {v26.b}[1], [TMP2], 1 |
| 1475 st1 {v27.b}[5], [TMP2], 1 |
| 1476 |
| 1477 sub sp, sp, #208 |
| 1478 ldr x15, [sp], 16 |
| 1479 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 1480 ld1 {v8.8b - v11.8b}, [sp], 32 |
| 1481 ld1 {v12.8b - v15.8b}, [sp], 32 |
| 1482 ld1 {v16.8b - v19.8b}, [sp], 32 |
| 1483 ld1 {v21.8b - v22.8b}, [sp], 16 |
| 1484 ld1 {v24.8b - v27.8b}, [sp], 32 |
| 1485 ld1 {v30.8b - v31.8b}, [sp], 16 |
| 1486 blr x30 |
| 1487 |
| 1488 .unreq DCT_TABLE |
| 1489 .unreq COEF_BLOCK |
| 1490 .unreq OUTPUT_BUF |
| 1491 .unreq OUTPUT_COL |
| 1492 .unreq TMP1 |
| 1493 .unreq TMP2 |
| 1494 |
| 1495 .purgem idct_helper |
| 1496 |
| 1497 |
| 1498 /*****************************************************************************/ |
| 1499 |
| 1500 /* |
| 1501 * jsimd_ycc_extrgb_convert_neon |
| 1502 * jsimd_ycc_extbgr_convert_neon |
| 1503 * jsimd_ycc_extrgbx_convert_neon |
| 1504 * jsimd_ycc_extbgrx_convert_neon |
| 1505 * jsimd_ycc_extxbgr_convert_neon |
| 1506 * jsimd_ycc_extxrgb_convert_neon |
| 1507 * |
| 1508 * Colorspace conversion YCbCr -> RGB |
| 1509 */ |
| 1510 |
| 1511 |
| 1512 .macro do_load size |
| 1513 .if \size == 8 |
| 1514 ld1 {v4.8b}, [U], 8 |
| 1515 ld1 {v5.8b}, [V], 8 |
| 1516 ld1 {v0.8b}, [Y], 8 |
| 1517 prfm PLDL1KEEP, [U, #64] |
| 1518 prfm PLDL1KEEP, [V, #64] |
| 1519 prfm PLDL1KEEP, [Y, #64] |
| 1520 .elseif \size == 4 |
| 1521 ld1 {v4.b}[0], [U], 1 |
| 1522 ld1 {v4.b}[1], [U], 1 |
| 1523 ld1 {v4.b}[2], [U], 1 |
| 1524 ld1 {v4.b}[3], [U], 1 |
| 1525 ld1 {v5.b}[0], [V], 1 |
| 1526 ld1 {v5.b}[1], [V], 1 |
| 1527 ld1 {v5.b}[2], [V], 1 |
| 1528 ld1 {v5.b}[3], [V], 1 |
| 1529 ld1 {v0.b}[0], [Y], 1 |
| 1530 ld1 {v0.b}[1], [Y], 1 |
| 1531 ld1 {v0.b}[2], [Y], 1 |
| 1532 ld1 {v0.b}[3], [Y], 1 |
| 1533 .elseif \size == 2 |
| 1534 ld1 {v4.b}[4], [U], 1 |
| 1535 ld1 {v4.b}[5], [U], 1 |
| 1536 ld1 {v5.b}[4], [V], 1 |
| 1537 ld1 {v5.b}[5], [V], 1 |
| 1538 ld1 {v0.b}[4], [Y], 1 |
| 1539 ld1 {v0.b}[5], [Y], 1 |
| 1540 .elseif \size == 1 |
| 1541 ld1 {v4.b}[6], [U], 1 |
| 1542 ld1 {v5.b}[6], [V], 1 |
| 1543 ld1 {v0.b}[6], [Y], 1 |
| 1544 .else |
| 1545 .error unsupported macroblock size |
| 1546 .endif |
| 1547 .endm |
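| /* For partial macroblocks the 4-, 2- and 1-pixel loads above fill |
|  * consecutive lanes (0-3, 4-5, 6) of v0/v4/v5, so a leftover run of up to |
|  * 7 pixels is assembled in the low lanes and converted in a single pass. */ |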
| 1548 |
| 1549 .macro do_store bpp, size |
| 1550 .if \bpp == 24 |
| 1551 .if \size == 8 |
| 1552 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 |
| 1553 .elseif \size == 4 |
| 1554 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 |
| 1555 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 |
| 1556 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 |
| 1557 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 |
| 1558 .elseif \size == 2 |
| 1559 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 |
| 1560 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 |
| 1561 .elseif \size == 1 |
| 1562 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 |
| 1563 .else |
| 1564 .error unsupported macroblock size |
| 1565 .endif |
| 1566 .elseif \bpp == 32 |
| 1567 .if \size == 8 |
| 1568 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 |
| 1569 .elseif \size == 4 |
| 1570 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 |
| 1571 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 |
| 1572 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 |
| 1573 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 |
| 1574 .elseif \size == 2 |
| 1575 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 |
| 1576 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 |
| 1577 .elseif \size == 1 |
| 1578 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 |
| 1579 .else |
| 1580 .error unsupported macroblock size |
| 1581 .endif |
| 1582 .elseif \bpp == 16 |
| 1583 .if \size == 8 |
| 1584 st1 {v25.8h}, [RGB], 16 |
| 1585 .elseif \size == 4 |
| 1586 st1 {v25.4h}, [RGB], 8 |
| 1587 .elseif \size == 2 |
| 1588 st1 {v25.h}[4], [RGB], 2 |
| 1589 st1 {v25.h}[5], [RGB], 2 |
| 1590 .elseif \size == 1 |
| 1591 st1 {v25.h}[6], [RGB], 2 |
| 1592 .else |
| 1593 .error unsupported macroblock size |
| 1594 .endif |
| 1595 .else |
| 1596 .error unsupported bpp |
| 1597 .endif |
| 1598 .endm |
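| /* 24- and 32-bpp output is written with the element-interleaving st3/st4 |
|  * stores of the per-channel registers v10-v13, while the 16-bpp (RGB565) |
|  * case stores the pre-packed pixels held in v25. */ |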
| 1599 |
| 1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize |
| 1601 |
| 1602 /* |
| 1603 * 2-stage pipelined YCbCr->RGB conversion |
| 1604 */ |
| 1605 |
| 1606 .macro do_yuv_to_rgb_stage1 |
| 1607 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
| 1608 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
| 1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ |
| 1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ |
| 1617 .endm |
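| /* Stage 1 computes the standard JPEG YCbCr->RGB chroma terms in fixed |
|  * point: |
|  *   R - Y =  1.40200 * (Cr-128)  (22971 / 2^14) |
|  *   B - Y =  1.77200 * (Cb-128)  (29033 / 2^14) |
|  *   G - Y = -0.34414 * (Cb-128) - 0.71414 * (Cr-128) |
|  *           (-11277 / 2^15 and -23401 / 2^15) |
|  * which matches the rounding shifts (#14, #15) applied in stage 2. */ |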
| 1618 |
| 1619 .macro do_yuv_to_rgb_stage2 |
| 1620 rshrn v20.4h, v20.4s, #15 |
| 1621 rshrn2 v20.8h, v22.4s, #15 |
| 1622 rshrn v24.4h, v24.4s, #14 |
| 1623 rshrn2 v24.8h, v26.4s, #14 |
| 1624 rshrn v28.4h, v28.4s, #14 |
| 1625 rshrn2 v28.8h, v30.4s, #14 |
| 1626 uaddw v20.8h, v20.8h, v0.8b |
| 1627 uaddw v24.8h, v24.8h, v0.8b |
| 1628 uaddw v28.8h, v28.8h, v0.8b |
| 1629 .if \bpp != 16 |
| 1630 sqxtun v1\g_offs\defsize, v20.8h |
| 1631 sqxtun v1\r_offs\defsize, v24.8h |
| 1632 sqxtun v1\b_offs\defsize, v28.8h |
| 1633 .else |
| 1634 sqshlu v21.8h, v20.8h, #8 |
| 1635 sqshlu v25.8h, v24.8h, #8 |
| 1636 sqshlu v29.8h, v28.8h, #8 |
| 1637 sri v25.8h, v21.8h, #5 |
| 1638 sri v25.8h, v29.8h, #11 |
| 1639 .endif |
| 1640 |
| 1641 .endm |
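| /* For RGB565, sqshlu saturates each channel into the top byte of a |
|  * halfword, and the two sri inserts then pack 5-bit red, 6-bit green and |
|  * 5-bit blue into v25. */ |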
| 1642 |
| 1643 .macro do_yuv_to_rgb_stage2_store_load_stage1 |
| 1644 rshrn v20.4h, v20.4s, #15 |
| 1645 rshrn v24.4h, v24.4s, #14 |
| 1646 rshrn v28.4h, v28.4s, #14 |
| 1647 ld1 {v4.8b}, [U], 8 |
| 1648 rshrn2 v20.8h, v22.4s, #15 |
| 1649 rshrn2 v24.8h, v26.4s, #14 |
| 1650 rshrn2 v28.8h, v30.4s, #14 |
| 1651 ld1 {v5.8b}, [V], 8 |
| 1652 uaddw v20.8h, v20.8h, v0.8b |
| 1653 uaddw v24.8h, v24.8h, v0.8b |
| 1654 uaddw v28.8h, v28.8h, v0.8b |
| 1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ |
| 1656 sqxtun v1\g_offs\defsize, v20.8h |
| 1657 ld1 {v0.8b}, [Y], 8 |
| 1658 sqxtun v1\r_offs\defsize, v24.8h |
| 1659 prfm PLDL1KEEP, [U, #64] |
| 1660 prfm PLDL1KEEP, [V, #64] |
| 1661 prfm PLDL1KEEP, [Y, #64] |
| 1662 sqxtun v1\b_offs\defsize, v28.8h |
| 1663 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
| 1664 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
| 1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 1671 .else /**************************** rgb565 ***********************************/ |
| 1672 sqshlu v21.8h, v20.8h, #8 |
| 1673 sqshlu v25.8h, v24.8h, #8 |
| 1674 sqshlu v29.8h, v28.8h, #8 |
| 1675 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
| 1676 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
| 1677 ld1 {v0.8b}, [Y], 8 |
| 1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 1682 sri v25.8h, v21.8h, #5 |
| 1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 1685 prfm PLDL1KEEP, [U, #64] |
| 1686 prfm PLDL1KEEP, [V, #64] |
| 1687 prfm PLDL1KEEP, [Y, #64] |
| 1688 sri v25.8h, v29.8h, #11 |
| 1689 .endif |
| 1690 do_store \bpp, 8 |
| 1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ |
| 1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ |
| 1693 .endm |
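| /* This fused macro is the steady-state loop body: while the current 8 |
|  * pixels are narrowed, packed and stored, the loads, prefetches and |
|  * stage-1 multiplies for the next 8 pixels are interleaved with them to |
|  * hide memory and multiplier latency. */ |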
| 1694 |
| 1695 .macro do_yuv_to_rgb |
| 1696 do_yuv_to_rgb_stage1 |
| 1697 do_yuv_to_rgb_stage2 |
| 1698 .endm |
| 1699 |
| 1700 /* Apple gas crashes on adrl, work around that by using adr. |
| 1701 * But this requires a copy of these constants for each function. |
| 1702 */ |
| 1703 |
| 1704 .balign 16 |
| 1705 jsimd_ycc_\colorid\()_neon_consts: |
| 1706 .short 0, 0, 0, 0 |
| 1707 .short 22971, -11277, -23401, 29033 |
| 1708 .short -128, -128, -128, -128 |
| 1709 .short -128, -128, -128, -128 |
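| /* The table provides, in order: four padding zeros for v0.4h, the four |
|  * fixed-point multipliers for v1.4h, and eight copies of -128 for v2.8h |
|  * (the Cb/Cr centring bias). */ |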
| 1710 |
| 1711 asm_function jsimd_ycc_\colorid\()_convert_neon |
| 1712 OUTPUT_WIDTH .req x0 |
| 1713 INPUT_BUF .req x1 |
| 1714 INPUT_ROW .req x2 |
| 1715 OUTPUT_BUF .req x3 |
| 1716 NUM_ROWS .req x4 |
| 1717 |
| 1718 INPUT_BUF0 .req x5 |
| 1719 INPUT_BUF1 .req x6 |
| 1720 INPUT_BUF2 .req INPUT_BUF |
| 1721 |
| 1722 RGB .req x7 |
| 1723 Y .req x8 |
| 1724 U .req x9 |
| 1725 V .req x10 |
| 1726 N .req x15 |
| 1727 |
| 1728 sub sp, sp, 336 |
| 1729 str x15, [sp], 16 |
| 1730 /* Load constants: v1.4h = multipliers, v2.8h = -128 bias (v0.4h is just padding) */ |
| 1731 adr x15, jsimd_ycc_\colorid\()_neon_consts |
| 1732 /* Save NEON registers */ |
| 1733 st1 {v0.8b - v3.8b}, [sp], 32 |
| 1734 st1 {v4.8b - v7.8b}, [sp], 32 |
| 1735 st1 {v8.8b - v11.8b}, [sp], 32 |
| 1736 st1 {v12.8b - v15.8b}, [sp], 32 |
| 1737 st1 {v16.8b - v19.8b}, [sp], 32 |
| 1738 st1 {v20.8b - v23.8b}, [sp], 32 |
| 1739 st1 {v24.8b - v27.8b}, [sp], 32 |
| 1740 st1 {v28.8b - v31.8b}, [sp], 32 |
| 1741 ld1 {v0.4h, v1.4h}, [x15], 16 |
| 1742 ld1 {v2.8h}, [x15] |
| 1743 |
| 1744 /* Save ARM registers and handle input arguments */ |
| 1745 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ |
| 1746 stp x4, x5, [sp], 16 |
| 1747 stp x6, x7, [sp], 16 |
| 1748 stp x8, x9, [sp], 16 |
| 1749 stp x10, x30, [sp], 16 |
| 1750 ldr INPUT_BUF0, [INPUT_BUF] |
| 1751 ldr INPUT_BUF1, [INPUT_BUF, 8] |
| 1752 ldr INPUT_BUF2, [INPUT_BUF, 16] |
| 1753 .unreq INPUT_BUF |
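| /* INPUT_BUF points at the three per-component arrays of row pointers |
|  * (Y, Cb, Cr); each array base is kept in INPUT_BUF0-2 and indexed by |
|  * INPUT_ROW * 8 (the pointer size) in the scanline loop below. */ |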
| 1754 |
| 1755 /* Initially set v10.16b and v13.16b to 0xFF (X/alpha byte for 32-bpp formats) */ |
| 1756 movi v10.16b, #255 |
| 1757 movi v13.16b, #255 |
| 1758 |
| 1759 /* Outer loop over scanlines */ |
| 1760 cmp NUM_ROWS, #1 |
| 1761 blt 9f |
| 1762 0: |
| 1763 lsl x16, INPUT_ROW, #3 |
| 1764 ldr Y, [INPUT_BUF0, x16] |
| 1765 ldr U, [INPUT_BUF1, x16] |
| 1766 mov N, OUTPUT_WIDTH |
| 1767 ldr V, [INPUT_BUF2, x16] |
| 1768 add INPUT_ROW, INPUT_ROW, #1 |
| 1769 ldr RGB, [OUTPUT_BUF], #8 |
| 1770 |
| 1771 /* Inner loop over pixels */ |
| 1772 subs N, N, #8 |
| 1773 blt 3f |
| 1774 do_load 8 |
| 1775 do_yuv_to_rgb_stage1 |
| 1776 subs N, N, #8 |
| 1777 blt 2f |
| 1778 1: |
| 1779 do_yuv_to_rgb_stage2_store_load_stage1 |
| 1780 subs N, N, #8 |
| 1781 bge 1b |
| 1782 2: |
| 1783 do_yuv_to_rgb_stage2 |
| 1784 do_store \bpp, 8 |
| 1785 tst N, #7 |
| 1786 beq 8f |
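| /* Tail handling: after the unrolled loop N is negative and its low three |
|  * bits equal OUTPUT_WIDTH mod 8, so the tests below pick the 4-, 2- and |
|  * 1-pixel partial loads/stores needed to finish the row. */ |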
| 1787 3: |
| 1788 tst N, #4 |
| 1789 beq 3f |
| 1790 do_load 4 |
| 1791 3: |
| 1792 tst N, #2 |
| 1793 beq 4f |
| 1794 do_load 2 |
| 1795 4: |
| 1796 tst N, #1 |
| 1797 beq 5f |
| 1798 do_load 1 |
| 1799 5: |
| 1800 do_yuv_to_rgb |
| 1801 tst N, #4 |
| 1802 beq 6f |
| 1803 do_store \bpp, 4 |
| 1804 6: |
| 1805 tst N, #2 |
| 1806 beq 7f |
| 1807 do_store \bpp, 2 |
| 1808 7: |
| 1809 tst N, #1 |
| 1810 beq 8f |
| 1811 do_store \bpp, 1 |
| 1812 8: |
| 1813 subs NUM_ROWS, NUM_ROWS, #1 |
| 1814 bgt 0b |
| 1815 9: |
| 1816 /* Restore all registers and return */ |
| 1817 sub sp, sp, #336 |
| 1818 ldr x15, [sp], 16 |
| 1819 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 1820 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 1821 ld1 {v8.8b - v11.8b}, [sp], 32 |
| 1822 ld1 {v12.8b - v15.8b}, [sp], 32 |
| 1823 ld1 {v16.8b - v19.8b}, [sp], 32 |
| 1824 ld1 {v20.8b - v23.8b}, [sp], 32 |
| 1825 ld1 {v24.8b - v27.8b}, [sp], 32 |
| 1826 ld1 {v28.8b - v31.8b}, [sp], 32 |
| 1827 /* pop {x4, x5, x6, x7, x8, x9, x10, x30} */ |
| 1828 ldp x4, x5, [sp], 16 |
| 1829 ldp x6, x7, [sp], 16 |
| 1830 ldp x8, x9, [sp], 16 |
| 1831 ldp x10, x30, [sp], 16 |
| 1832 br x30 |
| 1833 .unreq OUTPUT_WIDTH |
| 1834 .unreq INPUT_ROW |
| 1835 .unreq OUTPUT_BUF |
| 1836 .unreq NUM_ROWS |
| 1837 .unreq INPUT_BUF0 |
| 1838 .unreq INPUT_BUF1 |
| 1839 .unreq INPUT_BUF2 |
| 1840 .unreq RGB |
| 1841 .unreq Y |
| 1842 .unreq U |
| 1843 .unreq V |
| 1844 .unreq N |
| 1845 |
| 1846 .purgem do_yuv_to_rgb |
| 1847 .purgem do_yuv_to_rgb_stage1 |
| 1848 .purgem do_yuv_to_rgb_stage2 |
| 1849 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
| 1850 .endm |
| 1851 |
| 1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ |
| 1853 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h, 1, .4h, 2, .4h, .8b |
| 1854 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h, 1, .4h, 0, .4h, .8b |
| 1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b |
| 1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b |
| 1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b |
| 1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b |
| 1859 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h, 0, .4h, 0, .4h, .8b |
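| /* The r/g/b offset arguments select which of v10-v13 receives each |
|  * channel (v1<offs> expands to v10..v13), so the X byte of the 32-bpp |
|  * formats keeps the 0xFF preset in v10 or v13; the offsets are unused for |
|  * rgb565, which stores the packed pixels from v25 instead. */ |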
| 1860 .purgem do_load |
| 1861 .purgem do_store |
OLD | NEW |