third_party/libjpeg_turbo/simd/jsimd_arm_neon.S - Issue 7554002: Updates libjpeg-turbo to 1.1.90 (r677)

Side by Side Diff: third_party/libjpeg_turbo/simd/jsimd_arm_neon.S

Issue 7554002: Updates libjpeg-turbo to 1.1.90 (r677) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/

Patch Set: '' Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 * ARM NEON optimizations for libjpeg-turbo

	3 *

	4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).

	5 * All rights reserved.

	6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>

	7 *

	8 * This software is provided 'as-is', without any express or implied

	9 * warranty. In no event will the authors be held liable for any damages

	10 * arising from the use of this software.

	11 *

	12 * Permission is granted to anyone to use this software for any purpose,

	13 * including commercial applications, and to alter it and redistribute it

	14 * freely, subject to the following restrictions:

	15 *

	16 * 1. The origin of this software must not be misrepresented; you must not

	17 * claim that you wrote the original software. If you use this software

	18 * in a product, an acknowledgment in the product documentation would be

	19 * appreciated but is not required.

	20 * 2. Altered source versions must be plainly marked as such, and must not be

	21 * misrepresented as being the original software.

	22 * 3. This notice may not be removed or altered from any source distribution.

	23 */

	24

	25 #if defined(__linux__) && defined(__ELF__)

	26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */

	27 #endif

	28

	29 .text

	30 .fpu neon

	31 .arch armv7a

	32 .object_arch armv4

	33 .arm

	34

	35

	36 #define RESPECT_STRICT_ALIGNMENT 1 /* non-zero: jsimd_idct_4x4_neon writes its output with byte lane stores instead of 32-bit lane stores */

	37

	38 /*****************************************************************************/

	39

	40 /* Supplementary macro for setting function attributes: opens a .func block, exports the symbol and emits its entry label */

	41 .macro asm_function fname

	42 #ifdef __APPLE__ /* Apple targets: the symbol gets a leading underscore */

	43 .func _\fname

	44 .globl _\fname

	45 _\fname:

	46 #else

	47 .func \fname

	48 .global \fname

	49 #ifdef __ELF__ /* ELF only: hidden visibility, symbol typed as a function */

	50 .hidden \fname

	51 .type \fname, %function

	52 #endif

	53 \fname:

	54 #endif

	55 .endm

	56

	57 /* Transpose a block of 4x4 coefficients (16-bit elements) held in four 64-bit registers, in place */

	58 .macro transpose_4x4 x0, x1, x2, x3

	59 vtrn.16 \x0, \x1 /* swap 16-bit elements between rows 0 and 1 */

	60 vtrn.16 \x2, \x3 /* swap 16-bit elements between rows 2 and 3 */

	61 vtrn.32 \x0, \x2 /* swap 32-bit pairs between rows 0 and 2 */

	62 vtrn.32 \x1, \x3 /* swap 32-bit pairs between rows 1 and 3 */

	63 .endm

	64

	65 /*****************************************************************************/

	66

	67 /*

	68 * jsimd_idct_ifast_neon

	69 *

	70 * This function contains a fast, not so accurate integer implementation of

	71 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations

	72 * and produces exactly the same output as IJG's original 'jpeg_idct_fast'

	73 * function from jidctfst.c

	74 *

	75 * TODO: a bit better instructions scheduling is needed.

	76 */

	77

	78 #define XFIX_1_082392200 d0[0] /* symbolic names for the lanes of d0 once the table below is loaded */

	79 #define XFIX_1_414213562 d0[1]

	80 #define XFIX_1_847759065 d0[2]

	81 #define XFIX_2_613125930 d0[3]

	82

	83 .balign 16

	84 jsimd_idct_ifast_neon_consts:

	85 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: only the part above 1.0, (277 - 256)/256, as a Q15 multiplier for vqdmulh */

	86 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: only the part above 1.0, (362 - 256)/256 */

	87 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: only the part above 1.0, (473 - 256)/256 */

	88 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: only the part above 2.0, (669 - 512)/256 */

	89

	90 /* 1-D IDCT helper macro: transforms x0..x7 in place, uses t10..t14 as scratch and reads its multipliers from d0[0..3] */

	91

	92 .macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \

	93 t10, t11, t12, t13, t14

	94

	95 vsub.s16 \t10, \x0, \x4 /* t10 = x0 - x4 */

	96 vadd.s16 \x4, \x0, \x4 /* x4 = x0 + x4 */

	97 vswp.s16 \t10, \x0 /* x0 now holds the difference */

	98 vsub.s16 \t11, \x2, \x6 /* t11 = x2 - x6 */

	99 vadd.s16 \x6, \x2, \x6 /* x6 = x2 + x6 */

	100 vswp.s16 \t11, \x2 /* x2 now holds the difference */

	101 vsub.s16 \t10, \x3, \x5 /* t10 = x3 - x5 */

	102 vadd.s16 \x5, \x3, \x5 /* x5 = x3 + x5 */

	103 vswp.s16 \t10, \x3 /* x3 now holds the difference */

	104 vsub.s16 \t11, \x1, \x7 /* t11 = x1 - x7 */

	105 vadd.s16 \x7, \x1, \x7 /* x7 = x1 + x7 */

	106 vswp.s16 \t11, \x1 /* x1 now holds the difference */

	107

	108 vqdmulh.s16 \t13, \x2, d0[1] /* t13 = x2 * (362 - 256)/256 */

	109 vadd.s16 \t12, \x3, \x3 /* t12 = 2 * x3 */

	110 vadd.s16 \x2, \x2, \t13 /* x2 = x2 * 362/256 (1.414...) */

	111 vqdmulh.s16 \t13, \x3, d0[3] /* t13 = x3 * (669 - 512)/256 */

	112 vsub.s16 \t10, \x1, \x3

	113 vadd.s16 \t12, \t12, \t13 /* t12 = x3 * 669/256 (2.613...) */

	114 vqdmulh.s16 \t13, \t10, d0[2] /* t13 = t10 * (473 - 256)/256 */

	115 vsub.s16 \t11, \x7, \x5

	116 vadd.s16 \t10, \t10, \t13 /* t10 = t10 * 473/256 (1.847...) */

	117 vqdmulh.s16 \t13, \t11, d0[1] /* t13 = t11 * (362 - 256)/256 */

	118 vadd.s16 \t11, \t11, \t13 /* t11 = t11 * 362/256 (1.414...) */

	119

	120 vqdmulh.s16 \t13, \x1, d0[0] /* t13 = x1 * (277 - 256)/256; added to x1 further below */

	121 vsub.s16 \x2, \x6, \x2

	122 vsub.s16 \t14, \x0, \x2

	123 vadd.s16 \x2, \x0, \x2

	124 vadd.s16 \x0, \x4, \x6

	125 vsub.s16 \x4, \x4, \x6

	126 vadd.s16 \x1, \x1, \t13 /* x1 = x1 * 277/256 (1.082...) */

	127 vadd.s16 \t13, \x7, \x5

	128 vsub.s16 \t12, \t13, \t12

	129 vsub.s16 \t12, \t12, \t10

	130 vadd.s16 \t11, \t12, \t11

	131 vsub.s16 \t10, \x1, \t10

	132 vadd.s16 \t10, \t10, \t11

	133

	134 vsub.s16 \x7, \x0, \t13 /* final butterflies: each sum/difference pair fills two of the outputs x0..x7 */

	135 vadd.s16 \x0, \x0, \t13

	136 vadd.s16 \x6, \t14, \t12

	137 vsub.s16 \x1, \t14, \t12

	138 vsub.s16 \x5, \x2, \t11

	139 vadd.s16 \x2, \x2, \t11

	140 vsub.s16 \x3, \x4, \t10

	141 vadd.s16 \x4, \x4, \t10

	142 .endm

	143

	144 asm_function jsimd_idct_ifast_neon

	145

	146 DCT_TABLE .req r0 /* arg 1: 64 16-bit multipliers applied to the coefficients ("Dequantize" below) */

	147 COEF_BLOCK .req r1 /* arg 2: 64 16-bit input coefficients */

	148 OUTPUT_BUF .req r2 /* arg 3: array of 8 output row pointers */

	149 OUTPUT_COL .req r3 /* arg 4: byte offset added to every row pointer */

	150 TMP .req ip

	151

	152 vpush {d8-d15} /* q4-q7 are overwritten below; d8-d15 are callee-saved under the AAPCS */

	153

	154 /* Load constants */

	155 adr TMP, jsimd_idct_ifast_neon_consts

	156 vld1.16 {d0}, [TMP, :64]

	157

	158 /* Load all COEF_BLOCK into NEON registers with the following allocation:

	159 * 0 1 2 3 | 4 5 6 7

	160 * ---------+--------

	161 * 0 | d4 | d5

	162 * 1 | d6 | d7

	163 * 2 | d8 | d9

	164 * 3 | d10 | d11

	165 * 4 | d12 | d13

	166 * 5 | d14 | d15

	167 * 6 | d16 | d17

	168 * 7 | d18 | d19

	169 */

	170 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!

	171 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!

	172 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!

	173 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!

	174 /* Dequantize: multiply each coefficient by the matching DCT_TABLE entry (table loads are interleaved with the multiplies) */

	175 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!

	176 vmul.s16 q2, q2, q10

	177 vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!

	178 vmul.s16 q3, q3, q11

	179 vmul.s16 q4, q4, q12

	180 vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!

	181 vmul.s16 q5, q5, q13

	182 vmul.s16 q6, q6, q14

	183 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! /* q10/q11 are reused for the last two table rows */

	184 vmul.s16 q7, q7, q15

	185 vmul.s16 q8, q8, q10

	186 vmul.s16 q9, q9, q11

	187

	188 /* Pass 1 */

	189 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14

	190 /* Transpose the 8x8 block: transpose each 4x4 quadrant, then swap the two off-diagonal quadrants */

	191 transpose_4x4 d4, d6, d8, d10

	192 transpose_4x4 d5, d7, d9, d11

	193 transpose_4x4 d12, d14, d16, d18

	194 transpose_4x4 d13, d15, d17, d19

	195 vswp d12, d5

	196 vswp d14, d7

	197 vswp d16, d9

	198 vswp d18, d11

	199

	200 /* Pass 2 */

	201 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14

	202 /* Transpose (same quadrant scheme as after pass 1) */

	203 transpose_4x4 d4, d6, d8, d10

	204 transpose_4x4 d5, d7, d9, d11

	205 transpose_4x4 d12, d14, d16, d18

	206 transpose_4x4 d13, d15, d17, d19

	207 vswp d12, d5

	208 vswp d14, d7

	209 vswp d16, d9

	210 vswp d18, d11

	211

	212 /* Descale and range limit: add 0x80 pre-scaled by the 5-bit shift, then shift right by 5 with saturation to unsigned 8 bits */

	213 vmov.s16 q15, #(0x80 << 5)

	214 vqadd.s16 q2, q2, q15

	215 vqadd.s16 q3, q3, q15

	216 vqadd.s16 q4, q4, q15

	217 vqadd.s16 q5, q5, q15

	218 vqadd.s16 q6, q6, q15

	219 vqadd.s16 q7, q7, q15

	220 vqadd.s16 q8, q8, q15

	221 vqadd.s16 q9, q9, q15

	222 vqshrun.s16 d4, q2, #5

	223 vqshrun.s16 d6, q3, #5

	224 vqshrun.s16 d8, q4, #5

	225 vqshrun.s16 d10, q5, #5

	226 vqshrun.s16 d12, q6, #5

	227 vqshrun.s16 d14, q7, #5

	228 vqshrun.s16 d16, q8, #5

	229 vqshrun.s16 d18, q9, #5

	230

	231 /* Store results to the output buffer: one 8-byte row per register */

	232 .irp x, d4, d6, d8, d10, d12, d14, d16, d18

	233 ldr TMP, [OUTPUT_BUF], #4 /* fetch the next row pointer and advance OUTPUT_BUF */

	234 add TMP, TMP, OUTPUT_COL /* apply the column offset */

	235 vst1.8 {\x}, [TMP]!

	236 .endr

	237

	238 vpop {d8-d15}

	239 bx lr

	240

	241 .unreq DCT_TABLE

	242 .unreq COEF_BLOCK

	243 .unreq OUTPUT_BUF

	244 .unreq OUTPUT_COL

	245 .unreq TMP

	246 .endfunc

	247

	248 .purgem idct_helper /* drop this variant so the name can be redefined further down */

	249

	250 /*****************************************************************************/

	251

	252 /*

	253 * jsimd_idct_4x4_neon

	254 *

	255 * This function contains inverse-DCT code for getting reduced-size

	256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations

	257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'

	258 * function from jpeg-6b (jidctred.c).

	259 *

	260 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which

	261 * requires much less arithmetic operations and hence should be faster.

	262 * The primary purpose of this particular NEON optimized function is

	263 * bit exact compatibility with jpeg-6b.

	264 *

	265 * TODO: a bit better instructions scheduling can be achieved by expanding

	266 * idct_helper/transpose_4x4 macros and reordering instructions,

	267 * but readability will suffer somewhat.

	268 */

	269

	270 #define CONST_BITS 13 /* fraction bits of the FIX_* values below: FIX(x) = x * 2^13, rounded */

	271

	272 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */

	273 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */

	274 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */

	275 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */

	276 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */

	277 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */

	278 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */

	279 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */

	280 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */

	281 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */

	282 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */

	283 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */

	284 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */

	285 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */

	286

	287 .balign 16

	288 jsimd_idct_4x4_neon_consts:

	289 .short FIX_1_847759065 /* d0[0] */

	290 .short -FIX_0_765366865 /* d0[1] */

	291 .short -FIX_0_211164243 /* d0[2] */

	292 .short FIX_1_451774981 /* d0[3] */

	293 .short -FIX_2_172734803 /* d1[0] */

	294 .short FIX_1_061594337 /* d1[1] */

	295 .short -FIX_0_509795579 /* d1[2] */

	296 .short -FIX_0_601344887 /* d1[3] */

	297 .short FIX_0_899976223 /* d2[0] */

	298 .short FIX_2_562915447 /* d2[1] */

	299 .short 1 << (CONST_BITS+1) /* d2[2]: multiplier applied to the first helper input */

	300 .short 0 /* d2[3]: not read by the helper macro */

	301

	302 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29

	303 vmull.s16 q14, \x4, d2[2] /* q14 = x4 * 2^(CONST_BITS+1) ... */

	304 vmlal.s16 q14, \x8, d0[0] /* ... + x8 * FIX_1_847759065 ... */

	305 vmlal.s16 q14, \x14, d0[1] /* ... - x14 * FIX_0_765366865 */

	306

	307 vmull.s16 q13, \x16, d1[2] /* q13 = weighted sum of x16, x12, x10, x6 with the d1[2], d1[3], d2[0], d2[1] multipliers */

	308 vmlal.s16 q13, \x12, d1[3]

	309 vmlal.s16 q13, \x10, d2[0]

	310 vmlal.s16 q13, \x6, d2[1]

	311

	312 vmull.s16 q15, \x4, d2[2] /* q15 = same x4 term as q14, with the x8/x14 products subtracted instead of added */

	313 vmlsl.s16 q15, \x8, d0[0]

	314 vmlsl.s16 q15, \x14, d0[1]

	315

	316 vmull.s16 q12, \x16, d0[2] /* q12 = weighted sum of x16, x12, x10, x6 with the d0[2], d0[3], d1[0], d1[1] multipliers */

	317 vmlal.s16 q12, \x12, d0[3]

	318 vmlal.s16 q12, \x10, d1[0]

	319 vmlal.s16 q12, \x6, d1[1]

	320

	321 vadd.s32 q10, q14, q13 /* y26 source: q14 + q13 */

	322 vsub.s32 q14, q14, q13 /* y29 source: q14 - q13 */

	323

	324 .if \shift > 16 /* vrshrn on 32-bit elements only takes shifts up to 16, so larger shifts are split into shift + narrow */

	325 vrshr.s32 q10, q10, #\shift

	326 vrshr.s32 q14, q14, #\shift

	327 vmovn.s32 \y26, q10

	328 vmovn.s32 \y29, q14

	329 .else

	330 vrshrn.s32 \y26, q10, #\shift

	331 vrshrn.s32 \y29, q14, #\shift

	332 .endif

	333

	334 vadd.s32 q10, q15, q12 /* y27 source: q15 + q12 */

	335 vsub.s32 q15, q15, q12 /* y28 source: q15 - q12 */

	336

	337 .if \shift > 16

	338 vrshr.s32 q10, q10, #\shift

	339 vrshr.s32 q15, q15, #\shift

	340 vmovn.s32 \y27, q10

	341 vmovn.s32 \y28, q15

	342 .else

	343 vrshrn.s32 \y27, q10, #\shift

	344 vrshrn.s32 \y28, q15, #\shift

	345 .endif

	346

	347 .endm

	348

	349 asm_function jsimd_idct_4x4_neon

	350

	351 DCT_TABLE .req r0 /* arg 1: 16-bit multipliers applied to the coefficients */

	352 COEF_BLOCK .req r1 /* arg 2: 16-bit input coefficients */

	353 OUTPUT_BUF .req r2 /* arg 3: array of 4 output row pointers */

	354 OUTPUT_COL .req r3 /* arg 4: byte offset added to every row pointer */

	355 TMP1 .req r0 /* TMP1-TMP3 alias r0-r2; they are only written by the final ldmia, after the arguments have been consumed */

	356 TMP2 .req r1

	357 TMP3 .req r2

	358 TMP4 .req ip

	359

	360 vpush {d8-d15} /* q4-q7 are overwritten below; d8-d15 are callee-saved under the AAPCS */

	361

	362 /* Load constants (d3 is just used for padding) */

	363 adr TMP4, jsimd_idct_4x4_neon_consts

	364 vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

	365

	366 /* Load all COEF_BLOCK into NEON registers with the following allocation:

	367 * 0 1 2 3 | 4 5 6 7

	368 * ---------+--------

	369 * 0 | d4 | d5

	370 * 1 | d6 | d7

	371 * 2 | d8 | d9

	372 * 3 | d10 | d11

	373 * 4 | - | -

	374 * 5 | d12 | d13

	375 * 6 | d14 | d15

	376 * 7 | d16 | d17

	377 */

	378 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!

	379 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!

	380 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 (8 coefficients of 2 bytes) */

	381 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!

	382 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!

	383 /* dequantize: multiply the loaded rows by the matching DCT_TABLE rows */

	384 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!

	385 vmul.s16 q2, q2, q9

	386 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!

	387 vmul.s16 q3, q3, q10

	388 vmul.s16 q4, q4, q11

	389 add DCT_TABLE, DCT_TABLE, #16 /* skip the table row for the unused row 4 */

	390 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!

	391 vmul.s16 q5, q5, q12

	392 vmul.s16 q6, q6, q13

	393 vld1.16 {d30, d31}, [DCT_TABLE, :128]!

	394 vmul.s16 q7, q7, q14

	395 vmul.s16 q8, q8, q15

	396

	397 /* Pass 1: run the helper on the left (columns 0-3) and right (columns 4-7) halves, transposing each 4x4 result */

	398 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10

	399 transpose_4x4 d4, d6, d8, d10

	400 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11

	401 transpose_4x4 d5, d7, d9, d11

	402

	403 /* Pass 2: inputs come from both transposed halves (d5 is not used); results land in d26-d29 */

	404 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29

	405 transpose_4x4 d26, d27, d28, d29

	406

	407 /* Range limit: add 0x80 and saturate to unsigned 8 bits */

	408 vmov.u16 q15, #0x80

	409 vadd.s16 q13, q13, q15

	410 vadd.s16 q14, q14, q15

	411 vqmovun.s16 d26, q13

	412 vqmovun.s16 d27, q14

	413

	414 /* Store results to the output buffer: fetch the four row pointers, then add the column offset */

	415 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}

	416 add TMP1, TMP1, OUTPUT_COL

	417 add TMP2, TMP2, OUTPUT_COL

	418 add TMP3, TMP3, OUTPUT_COL

	419 add TMP4, TMP4, OUTPUT_COL

	420

	421 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT

	422 /* We can use much less instructions on little endian systems if the

	423 * OS kernel is not configured to trap unaligned memory accesses

	424 */

	425 vst1.32 {d26[0]}, [TMP1]!

	426 vst1.32 {d27[0]}, [TMP3]!

	427 vst1.32 {d26[1]}, [TMP2]!

	428 vst1.32 {d27[1]}, [TMP4]!

	429 #else

	430 vst1.8 {d26[0]}, [TMP1]! /* byte-wise path: d26 bytes 0-3 -> TMP1 row, d27 bytes 0-3 -> TMP3 row */

	431 vst1.8 {d27[0]}, [TMP3]!

	432 vst1.8 {d26[1]}, [TMP1]!

	433 vst1.8 {d27[1]}, [TMP3]!

	434 vst1.8 {d26[2]}, [TMP1]!

	435 vst1.8 {d27[2]}, [TMP3]!

	436 vst1.8 {d26[3]}, [TMP1]!

	437 vst1.8 {d27[3]}, [TMP3]!

	438

	439 vst1.8 {d26[4]}, [TMP2]! /* d26 bytes 4-7 -> TMP2 row, d27 bytes 4-7 -> TMP4 row */

	440 vst1.8 {d27[4]}, [TMP4]!

	441 vst1.8 {d26[5]}, [TMP2]!

	442 vst1.8 {d27[5]}, [TMP4]!

	443 vst1.8 {d26[6]}, [TMP2]!

	444 vst1.8 {d27[6]}, [TMP4]!

	445 vst1.8 {d26[7]}, [TMP2]!

	446 vst1.8 {d27[7]}, [TMP4]!

	447 #endif

	448

	449 vpop {d8-d15}

	450 bx lr

	451

	452 .unreq DCT_TABLE

	453 .unreq COEF_BLOCK

	454 .unreq OUTPUT_BUF

	455 .unreq OUTPUT_COL

	456 .unreq TMP1

	457 .unreq TMP2

	458 .unreq TMP3

	459 .unreq TMP4

	460 .endfunc

	461

	462 .purgem idct_helper /* drop this variant so the name can be redefined further down */

	463

	464 /*****************************************************************************/

	465

	466 /*

	467 * jsimd_idct_2x2_neon

	468 *

	469 * This function contains inverse-DCT code for getting reduced-size

	470 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations

	471 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'

	472 * function from jpeg-6b (jidctred.c).

	473 *

	474 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which

	475 * requires much less arithmetic operations and hence should be faster.

	476 * The primary purpose of this particular NEON optimized function is

	477 * bit exact compatibility with jpeg-6b.

	478 */

	479

	480 .balign 8 /* 8-byte alignment satisfies the :64 hint on the load of this table */

	481 jsimd_idct_2x2_neon_consts:

	482 .short -FIX_0_720959822 /* d0[0] */

	483 .short FIX_0_850430095 /* d0[1] */

	484 .short -FIX_1_272758580 /* d0[2] */

	485 .short FIX_3_624509785 /* d0[3] */

	486

	487 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27

	488 vshll.s16 q14, \x4, #15 /* q14 = x4 widened to 32 bits and scaled by 2^15 */

	489 vmull.s16 q13, \x6, d0[3] /* q13 = x6 * FIX_3_624509785 ... */

	490 vmlal.s16 q13, \x10, d0[2] /* ... - x10 * FIX_1_272758580 ... */

	491 vmlal.s16 q13, \x12, d0[1] /* ... + x12 * FIX_0_850430095 ... */

	492 vmlal.s16 q13, \x16, d0[0] /* ... - x16 * FIX_0_720959822 */

	493

	494 vadd.s32 q10, q14, q13 /* y26 source: q14 + q13 */

	495 vsub.s32 q14, q14, q13 /* y27 source: q14 - q13 */

	496

	497 .if \shift > 16 /* vrshrn on 32-bit elements only takes shifts up to 16, so larger shifts are split into shift + narrow */

	498 vrshr.s32 q10, q10, #\shift

	499 vrshr.s32 q14, q14, #\shift

	500 vmovn.s32 \y26, q10

	501 vmovn.s32 \y27, q14

	502 .else

	503 vrshrn.s32 \y26, q10, #\shift

	504 vrshrn.s32 \y27, q14, #\shift

	505 .endif

	506

	507 .endm

	508

	509 asm_function jsimd_idct_2x2_neon

	510

	511 DCT_TABLE .req r0 /* arg 1: 16-bit multipliers applied to the coefficients */

	512 COEF_BLOCK .req r1 /* arg 2: 16-bit input coefficients */

	513 OUTPUT_BUF .req r2 /* arg 3: array of 2 output row pointers */

	514 OUTPUT_COL .req r3 /* arg 4: byte offset added to every row pointer */

	515 TMP1 .req r0 /* aliases DCT_TABLE; only written by the final ldmia */

	516 TMP2 .req ip

	517

	518 vpush {d8-d15} /* q5 and q6 are overwritten below; d8-d15 are callee-saved under the AAPCS */

	519

	520 /* Load constants */

	521 adr TMP2, jsimd_idct_2x2_neon_consts

	522 vld1.16 {d0}, [TMP2, :64]

	523

	524 /* Load all COEF_BLOCK into NEON registers with the following allocation:

	525 * 0 1 2 3 | 4 5 6 7

	526 * ---------+--------

	527 * 0 | d4 | d5

	528 * 1 | d6 | d7

	529 * 2 | - | -

	530 * 3 | d10 | d11

	531 * 4 | - | -

	532 * 5 | d12 | d13

	533 * 6 | - | -

	534 * 7 | d16 | d17

	535 */

	536 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!

	537 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */

	538 vld1.16 {d10, d11}, [COEF_BLOCK, :128]!

	539 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */

	540 vld1.16 {d12, d13}, [COEF_BLOCK, :128]!

	541 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */

	542 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!

	543 /* Dequantize: same row-skipping pattern on DCT_TABLE */

	544 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!

	545 vmul.s16 q2, q2, q9

	546 vmul.s16 q3, q3, q10

	547 add DCT_TABLE, DCT_TABLE, #16

	548 vld1.16 {d24, d25}, [DCT_TABLE, :128]!

	549 vmul.s16 q5, q5, q12

	550 add DCT_TABLE, DCT_TABLE, #16

	551 vld1.16 {d26, d27}, [DCT_TABLE, :128]!

	552 vmul.s16 q6, q6, q13

	553 add DCT_TABLE, DCT_TABLE, #16

	554 vld1.16 {d30, d31}, [DCT_TABLE, :128]!

	555 vmul.s16 q8, q8, q15

	556

	557 /* Pass 1: the macro-based variant is disabled; the #else branch is the code that is assembled */

	558 #if 0

	559 idct_helper d4, d6, d10, d12, d16, 13, d4, d6

	560 transpose_4x4 d4, d6, d8, d10

	561 idct_helper d5, d7, d11, d13, d17, 13, d5, d7

	562 transpose_4x4 d5, d7, d9, d11

	563 #else

	564 vmull.s16 q13, d6, d0[3] /* q13: weighted sum of rows 1, 3, 5, 7 for columns 0-3 */

	565 vmlal.s16 q13, d10, d0[2]

	566 vmlal.s16 q13, d12, d0[1]

	567 vmlal.s16 q13, d16, d0[0]

	568 vmull.s16 q12, d7, d0[3] /* q12: the same weighted sum for columns 4-7 */

	569 vmlal.s16 q12, d11, d0[2]

	570 vmlal.s16 q12, d13, d0[1]

	571 vmlal.s16 q12, d17, d0[0]

	572 vshll.s16 q14, d4, #15 /* row 0 widened and scaled by 2^15, columns 0-3 */

	573 vshll.s16 q15, d5, #15 /* row 0 widened and scaled by 2^15, columns 4-7 */

	574 vadd.s32 q10, q14, q13

	575 vsub.s32 q14, q14, q13

	576 vrshrn.s32 d4, q10, #13

	577 vrshrn.s32 d6, q14, #13

	578 vadd.s32 q10, q15, q12

	579 vsub.s32 q14, q15, q12

	580 vrshrn.s32 d5, q10, #13

	581 vrshrn.s32 d7, q14, #13

	582 vtrn.16 q2, q3

	583 vtrn.32 q3, q5

	584 #endif

	585

	586 /* Pass 2 */

	587 idct_helper d4, d6, d10, d7, d11, 20, d26, d27

	588

	589 /* Range limit: add 0x80 and saturate to unsigned 8 bits */

	590 vmov.u16 q15, #0x80

	591 vadd.s16 q13, q13, q15

	592 vqmovun.s16 d26, q13

	593 vqmovun.s16 d27, q13 /* q13's low half (d26) was just overwritten, but bytes 4-7 of d27 still come from q13's untouched high half, and only bytes 4 and 5 are stored below */

	594

	595 /* Store results to the output buffer: fetch the two row pointers, then add the column offset */

	596 ldmia OUTPUT_BUF, {TMP1, TMP2}

	597 add TMP1, TMP1, OUTPUT_COL

	598 add TMP2, TMP2, OUTPUT_COL

	599

	600 vst1.8 {d26[0]}, [TMP1]! /* TMP1 row: two consecutive bytes */

	601 vst1.8 {d27[4]}, [TMP1]!

	602 vst1.8 {d26[1]}, [TMP2]! /* TMP2 row: two consecutive bytes */

	603 vst1.8 {d27[5]}, [TMP2]!

	604

	605 vpop {d8-d15}

	606 bx lr

	607

	608 .unreq DCT_TABLE

	609 .unreq COEF_BLOCK

	610 .unreq OUTPUT_BUF

	611 .unreq OUTPUT_COL

	612 .unreq TMP1

	613 .unreq TMP2

	614 .endfunc

	615

	616 .purgem idct_helper

	617

	618 /*****************************************************************************/

	619

	620 /*

	621 * jsimd_ycc_extrgb_convert_neon

	622 * jsimd_ycc_extbgr_convert_neon

	623 * jsimd_ycc_extrgbx_convert_neon

	624 * jsimd_ycc_extbgrx_convert_neon

	625 * jsimd_ycc_extxbgr_convert_neon

	626 * jsimd_ycc_extxrgb_convert_neon

	627 *

	628 * Colorspace conversion YCbCr -> RGB

	629 */

	630

	631

	632 .macro do_load size

	633 .if \size == 8 /* full group: 8 bytes each from U into d4, V into d5 and Y into d0, all pointers post-incremented */

	634 vld1.8 {d4}, [U]!

	635 vld1.8 {d5}, [V]!

	636 vld1.8 {d0}, [Y]!

	637 pld [Y, #64] /* prefetch 64 bytes ahead on each input pointer */

	638 pld [U, #64]

	639 pld [V, #64]

	640 .elseif \size == 4 /* partial groups fill disjoint lanes: 4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6 */

	641 vld1.8 {d4[0]}, [U]!

	642 vld1.8 {d4[1]}, [U]!

	643 vld1.8 {d4[2]}, [U]!

	644 vld1.8 {d4[3]}, [U]!

	645 vld1.8 {d5[0]}, [V]!

	646 vld1.8 {d5[1]}, [V]!

	647 vld1.8 {d5[2]}, [V]!

	648 vld1.8 {d5[3]}, [V]!

	649 vld1.8 {d0[0]}, [Y]!

	650 vld1.8 {d0[1]}, [Y]!

	651 vld1.8 {d0[2]}, [Y]!

	652 vld1.8 {d0[3]}, [Y]!

	653 .elseif \size == 2

	654 vld1.8 {d4[4]}, [U]!

	655 vld1.8 {d4[5]}, [U]!

	656 vld1.8 {d5[4]}, [V]!

	657 vld1.8 {d5[5]}, [V]!

	658 vld1.8 {d0[4]}, [Y]!

	659 vld1.8 {d0[5]}, [Y]!

	660 .elseif \size == 1

	661 vld1.8 {d4[6]}, [U]!

	662 vld1.8 {d5[6]}, [V]!

	663 vld1.8 {d0[6]}, [Y]!

	664 .else

	665 .error unsupported macroblock size

	666 .endif

	667 .endm

	668

	669 .macro do_store bpp, size

	670 .if \bpp == 24 /* 3 bytes per pixel: interleave d10, d11, d12 */

	671 .if \size == 8

	672 vst3.8 {d10, d11, d12}, [RGB]!

	673 .elseif \size == 4 /* partial groups use lanes 0-3, 4-5 and 6 for sizes 4, 2 and 1 */

	674 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!

	675 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!

	676 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!

	677 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!

	678 .elseif \size == 2

	679 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!

	680 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!

	681 .elseif \size == 1

	682 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!

	683 .else

	684 .error unsupported macroblock size

	685 .endif

	686 .elseif \bpp == 32 /* 4 bytes per pixel: interleave d10, d11, d12, d13 */

	687 .if \size == 8

	688 vst4.8 {d10, d11, d12, d13}, [RGB]!

	689 .elseif \size == 4

	690 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!

	691 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!

	692 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!

	693 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!

	694 .elseif \size == 2

	695 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!

	696 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!

	697 .elseif \size == 1

	698 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!

	699 .else

	700 .error unsupported macroblock size

	701 .endif

	702 .else

	703 .error unsupported bpp

	704 .endif

	705 .endm

	706

	707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

	708

	709 .macro do_yuv_to_rgb

	710 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */

	711 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */

	712 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */

	713 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */

	714 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */

	715 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */

	716 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */

	717 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */

	718 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */

	719 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */

	720 vrshrn.s32 d20, q10, #15 /* green term: rounding shift by 15 */

	721 vrshrn.s32 d21, q11, #15

	722 vrshrn.s32 d24, q12, #14 /* red term: rounding shift by 14 */

	723 vrshrn.s32 d25, q13, #14

	724 vrshrn.s32 d28, q14, #14 /* blue term: rounding shift by 14 */

	725 vrshrn.s32 d29, q15, #14

	726 vaddw.u8 q10, q10, d0 /* add y to each colour term */

	727 vaddw.u8 q12, q12, d0

	728 vaddw.u8 q14, q14, d0

	729 vqmovun.s16 d1\g_offs, q10 /* saturate to u8; the offset digit is appended to "d1" to pick one of d10-d13 */

	730 vqmovun.s16 d1\r_offs, q12

	731 vqmovun.s16 d1\b_offs, q14

	732 .endm

	733

	734 /* Apple gas crashes on adrl, work around that by using adr.

	735 * But this requires a copy of these constants for each function.

	736 */

	737

	738 .balign 16

	739 jsimd_ycc_\colorid\()_neon_consts:

	740 .short 0, 0, 0, 0

	741 .short 22971, -11277, -23401, 29033

	742 .short -128, -128, -128, -128

	743 .short -128, -128, -128, -128

	744

	745 asm_function jsimd_ycc_\colorid\()_convert_neon

	746 OUTPUT_WIDTH .req r0

	747 INPUT_BUF .req r1

	748 INPUT_ROW .req r2

	749 OUTPUT_BUF .req r3

	750 NUM_ROWS .req r4

	751

	752 INPUT_BUF0 .req r5

	753 INPUT_BUF1 .req r6

	754 INPUT_BUF2 .req INPUT_BUF

	755

	756 RGB .req r7

	757 Y .req r8

	758 U .req r9

	759 V .req r10

	760 N .req ip

	761

	762 /* Load constants to d1, d2, d3 (d0 is just used for padding) */

	763 adr ip, jsimd_ycc_\colorid\()_neon_consts

	764 vld1.16 {d0, d1, d2, d3}, [ip, :128]

	765

	766 /* Save ARM registers and handle input arguments (NUM_ROWS is read from the caller's stack, just above the 8 registers pushed here) */

	767 push {r4, r5, r6, r7, r8, r9, r10, lr}

	768 ldr NUM_ROWS, [sp, #(4 * 8)]

	769 ldr INPUT_BUF0, [INPUT_BUF]

	770 ldr INPUT_BUF1, [INPUT_BUF, #4]

	771 ldr INPUT_BUF2, [INPUT_BUF, #8] /* INPUT_BUF2 is the same register as INPUT_BUF, so this load comes last */

	772 .unreq INPUT_BUF

	773

	774 /* Save NEON registers */

	775 vpush {d8-d15}

	776

	777 /* Initially set d10, d11, d12, d13 to 0xFF */

	778 vmov.u8 q5, #255

	779 vmov.u8 q6, #255

	780

	781 /* Outer loop over scanlines (skipped entirely when NUM_ROWS < 1) */

	782 cmp NUM_ROWS, #1

	783 blt 9f

	784 0:

	785 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] /* row pointer from plane 0 */

	786 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] /* row pointer from plane 1 */

	787 mov N, OUTPUT_WIDTH

	788 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] /* row pointer from plane 2 */

	789 add INPUT_ROW, INPUT_ROW, #1

	790 ldr RGB, [OUTPUT_BUF], #4 /* next output row pointer, post-incremented */

	791

	792 /* Inner loop over pixels: 8 at a time while at least 8 remain */

	793 subs N, N, #8

	794 blt 2f

	795 1:

	796 do_load 8

	797 do_yuv_to_rgb

	798 do_store \bpp, 8

	799 subs N, N, #8

	800 bge 1b

	801 tst N, #7 /* the low 3 bits of N still equal the number of leftover pixels */

	802 beq 8f

	803 2:

	804 tst N, #4 /* tail: load 4, 2 and/or 1 pixels into separate lanes, convert once, then store the same groups */

	805 beq 3f

	806 do_load 4

	807 3:

	808 tst N, #2

	809 beq 4f

	810 do_load 2

	811 4:

	812 tst N, #1

	813 beq 5f

	814 do_load 1

	815 5:

	816 do_yuv_to_rgb

	817 tst N, #4

	818 beq 6f

	819 do_store \bpp, 4

	820 6:

	821 tst N, #2

	822 beq 7f

	823 do_store \bpp, 2

	824 7:

	825 tst N, #1

	826 beq 8f

	827 do_store \bpp, 1

	828 8:

	829 subs NUM_ROWS, NUM_ROWS, #1

	830 bgt 0b

	831 9:

	832 /* Restore all registers and return */

	833 vpop {d8-d15}

	834 pop {r4, r5, r6, r7, r8, r9, r10, pc}

	835

	836 .unreq OUTPUT_WIDTH

	837 .unreq INPUT_ROW

	838 .unreq OUTPUT_BUF

	839 .unreq NUM_ROWS

	840 .unreq INPUT_BUF0

	841 .unreq INPUT_BUF1

	842 .unreq INPUT_BUF2

	843 .unreq RGB

	844 .unreq Y

	845 .unreq U

	846 .unreq V

	847 .unreq N

	848 .endfunc

	849

	850 .purgem do_yuv_to_rgb

	851

	852 .endm

	853

	854 /*--------------------------------- id ----- bpp R G B */

	855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 /* R, G, B offsets select d10, d11, d12; memory order is d10, d11, d12 */

	856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0

	857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 /* 32 bpp: the register no offset selects keeps its initial 0xFF */

	858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0

	859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1

	860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

	861

	862 .purgem do_load

	863 .purgem do_store

	864

	865 /*****************************************************************************/

OLD	NEW

« no previous file with comments | « third_party/libjpeg_turbo/simd/jsimd_arm.c ('k') | third_party/libjpeg_turbo/simd/jsimd_i386.c » ('j') | no next file with comments »