source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm - Issue 168343002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 ;

	2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

	3 ;

	4 ; Use of this source code is governed by a BSD-style license

	5 ; that can be found in the LICENSE file in the root of the source

	6 ; tree. An additional intellectual property rights grant can be found

	7 ; in the file PATENTS. All contributing project authors may

	8 ; be found in the AUTHORS file in the root of the source tree.

	9 ;

	10

	11 EXPORT \|vp9_iht8x8_64_add_neon\|

	12 ARM

	13 REQUIRE8

	14 PRESERVE8

	15

	16 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2

	17

	18 ; Generate the IADST constants in r0 - r10 and r12 (r11 is not written). Each non-zero constant is built with a mov of its upper bits followed by an add of its low byte.

	19 MACRO

	20 GENERATE_IADST_CONSTANTS

	21 ; r0 = cospi_2_64 = 16305 (0x3f00 + 0xb1)

	22 mov r0, #0x3f00

	23 add r0, #0xb1

	24

	25 ; r1 = cospi_30_64 = 1606 (0x600 + 0x46)

	26 mov r1, #0x600

	27 add r1, #0x46

	28

	29 ; r2 = cospi_10_64 = 14449 (0x3800 + 0x71)

	30 mov r2, #0x3800

	31 add r2, #0x71

	32

	33 ; r3 = cospi_22_64 = 7723 (0x1e00 + 0x2b)

	34 mov r3, #0x1e00

	35 add r3, #0x2b

	36

	37 ; r4 = cospi_18_64 = 10394 (0x2800 + 0x9a)

	38 mov r4, #0x2800

	39 add r4, #0x9a

	40

	41 ; r5 = cospi_14_64 = 12665 (0x3100 + 0x79)

	42 mov r5, #0x3100

	43 add r5, #0x79

	44

	45 ; r6 = cospi_26_64 = 4756 (0x1200 + 0x94)

	46 mov r6, #0x1200

	47 add r6, #0x94

	48

	49 ; r7 = cospi_6_64 = 15679 (0x3d00 + 0x3f)

	50 mov r7, #0x3d00

	51 add r7, #0x3f

	52

	53 ; r8 = cospi_8_64 = 15137 (0x3b00 + 0x21)

	54 mov r8, #0x3b00

	55 add r8, #0x21

	56

	57 ; r9 = cospi_24_64 = 6270 (0x1800 + 0x7e)

	58 mov r9, #0x1800

	59 add r9, #0x7e

	60

	61 ; r10 = 0

	62 mov r10, #0

	63

	64 ; r12 = cospi_16_64 = 11585 (0x2d00 + 0x41)

	65 mov r12, #0x2d00

	66 add r12, #0x41

	67 MEND

	68

	69 ; Generate the IDCT constants in r3 - r9 (no other core register is written). Each constant is built with a mov of its upper bits followed by an add of its low byte.

	70 MACRO

	71 GENERATE_IDCT_CONSTANTS

	72 ; r3 = cospi_28_64 = 3196 (0x0c00 + 0x7c)

	73 mov r3, #0x0c00

	74 add r3, #0x7c

	75

	76 ; r4 = cospi_4_64 = 16069 (0x3e00 + 0xc5)

	77 mov r4, #0x3e00

	78 add r4, #0xc5

	79

	80 ; r5 = cospi_12_64 = 13623 (0x3500 + 0x37)

	81 mov r5, #0x3500

	82 add r5, #0x37

	83

	84 ; r6 = cospi_20_64 = 9102 (0x2300 + 0x8e)

	85 mov r6, #0x2300

	86 add r6, #0x8e

	87

	88 ; r7 = cospi_16_64 = 11585 (0x2d00 + 0x41)

	89 mov r7, #0x2d00

	90 add r7, #0x41

	91

	92 ; r8 = cospi_24_64 = 6270 (0x1800 + 0x7e)

	93 mov r8, #0x1800

	94 add r8, #0x7e

	95

	96 ; r9 = cospi_8_64 = 15137 (0x3b00 + 0x21)

	97 mov r9, #0x3b00

	98 add r9, #0x21

	99 MEND

	100

	101 ; Transpose an 8x8 matrix of 16-bit elements in place. The rows are held in q8-q15 (d16-d31); no other register is touched.

	102 MACRO

	103 TRANSPOSE8X8

	104 vswp d17, d24 ; swap the upper half of q8 with the lower half of q12 (these four vswp exchange the two off-diagonal 4x4 quadrants)

	105 vswp d23, d30 ; upper half of q11 <-> lower half of q15

	106 vswp d21, d28 ; upper half of q10 <-> lower half of q14

	107 vswp d19, d26 ; upper half of q9 <-> lower half of q13

	108 vtrn.32 q8, q10 ; transpose 32-bit element pairs between rows two apart

	109 vtrn.32 q9, q11

	110 vtrn.32 q12, q14

	111 vtrn.32 q13, q15

	112 vtrn.16 q8, q9 ; transpose 16-bit elements between adjacent rows

	113 vtrn.16 q10, q11

	114 vtrn.16 q12, q13

	115 vtrn.16 q14, q15

	116 MEND

	117

	118 ; Parallel 1D IDCT on all the columns of an 8x8 matrix of 16-bit data, which is

	119 ; loaded in q8-q15 (q8 = input[0] ... q15 = input[7]). The IDCT constants are loaded in r3 - r9. The output

	120 ; will be stored back into q8-q15 registers. This macro will touch q0-q7

	121 ; registers and use them as buffer during calculation.

	122 MACRO

	123 IDCT8x8_1D

	124 ; stage 1

	125 vdup.16 d0, r3 ; duplicate cospi_28_64

	126 vdup.16 d1, r4 ; duplicate cospi_4_64

	127 vdup.16 d2, r5 ; duplicate cospi_12_64

	128 vdup.16 d3, r6 ; duplicate cospi_20_64

	129

	130 ; input[1] * cospi_28_64

	131 vmull.s16 q2, d18, d0

	132 vmull.s16 q3, d19, d0

	133

	134 ; input[5] * cospi_12_64

	135 vmull.s16 q5, d26, d2

	136 vmull.s16 q6, d27, d2

	137

	138 ; input[1] * cospi_28_64 - input[7] * cospi_4_64

	139 vmlsl.s16 q2, d30, d1

	140 vmlsl.s16 q3, d31, d1

	141

	142 ; input[5] * cospi_12_64 - input[3] * cospi_20_64

	143 vmlsl.s16 q5, d22, d3

	144 vmlsl.s16 q6, d23, d3

	145

	146 ; step1[4] (q4) = dct_const_round_shift(input[1] * cospi_28_64 - input[7] * cospi_4_64)

	147 vqrshrn.s32 d8, q2, #14 ; >> 14

	148 vqrshrn.s32 d9, q3, #14 ; >> 14

	149

	150 ; step1[5] (q5) = dct_const_round_shift(input[5] * cospi_12_64 - input[3] * cospi_20_64)

	151 vqrshrn.s32 d10, q5, #14 ; >> 14

	152 vqrshrn.s32 d11, q6, #14 ; >> 14

	153

	154 ; input[1] * cospi_4_64

	155 vmull.s16 q2, d18, d1

	156 vmull.s16 q3, d19, d1

	157

	158 ; input[5] * cospi_20_64 (q9 is reused as an accumulator here; input[1] has already been consumed)

	159 vmull.s16 q9, d26, d3

	160 vmull.s16 q13, d27, d3

	161

	162 ; input[1] * cospi_4_64 + input[7] * cospi_28_64

	163 vmlal.s16 q2, d30, d0

	164 vmlal.s16 q3, d31, d0

	165

	166 ; input[5] * cospi_20_64 + input[3] * cospi_12_64

	167 vmlal.s16 q9, d22, d2

	168 vmlal.s16 q13, d23, d2

	169

	170 ; step1[7] (q7) = dct_const_round_shift(input[1] * cospi_4_64 + input[7] * cospi_28_64)

	171 vqrshrn.s32 d14, q2, #14 ; >> 14

	172 vqrshrn.s32 d15, q3, #14 ; >> 14

	173

	174 ; stage 2 & stage 3 - even half (the comments below index the even rows q8, q10, q12, q14 as input[0], input[1], input[2], input[3])

	175 vdup.16 d0, r7 ; duplicate cospi_16_64

	176

	177 ; step1[6] (q6) = dct_const_round_shift(input[5] * cospi_20_64 + input[3] * cospi_12_64), still from stage 1

	178 vqrshrn.s32 d12, q9, #14 ; >> 14

	179 vqrshrn.s32 d13, q13, #14 ; >> 14

	180

	181 ; input[0] * cospi_16_64

	182 vmull.s16 q2, d16, d0

	183 vmull.s16 q3, d17, d0

	184

	185 ; input[0] * cospi_16_64

	186 vmull.s16 q13, d16, d0

	187 vmull.s16 q15, d17, d0

	188

	189 ; (input[0] + input[2]) * cospi_16_64

	190 vmlal.s16 q2, d24, d0

	191 vmlal.s16 q3, d25, d0

	192

	193 ; (input[0] - input[2]) * cospi_16_64

	194 vmlsl.s16 q13, d24, d0

	195 vmlsl.s16 q15, d25, d0

	196

	197 vdup.16 d0, r8 ; duplicate cospi_24_64

	198 vdup.16 d1, r9 ; duplicate cospi_8_64

	199

	200 ; step[0] (q9) = dct_const_round_shift((input[0] + input[2]) * cospi_16_64)

	201 vqrshrn.s32 d18, q2, #14 ; >> 14

	202 vqrshrn.s32 d19, q3, #14 ; >> 14

	203

	204 ; step[1] (q11) = dct_const_round_shift((input[0] - input[2]) * cospi_16_64)

	205 vqrshrn.s32 d22, q13, #14 ; >> 14

	206 vqrshrn.s32 d23, q15, #14 ; >> 14

	207

	208 ; input[1] * cospi_24_64

	209 vmull.s16 q2, d20, d0

	210 vmull.s16 q3, d21, d0

	211

	212 ; input[1] * cospi_8_64

	213 vmull.s16 q8, d20, d1

	214 vmull.s16 q12, d21, d1

	215

	216 ; input[1] * cospi_24_64 - input[3] * cospi_8_64

	217 vmlsl.s16 q2, d28, d1

	218 vmlsl.s16 q3, d29, d1

	219

	220 ; input[1] * cospi_8_64 + input[3] * cospi_24_64

	221 vmlal.s16 q8, d28, d0

	222 vmlal.s16 q12, d29, d0

	223

	224 ; step[2] (q13) = dct_const_round_shift(input[1] * cospi_24_64 - input[3] * cospi_8_64)

	225 vqrshrn.s32 d26, q2, #14 ; >> 14

	226 vqrshrn.s32 d27, q3, #14 ; >> 14

	227

	228 ; step[3] (q15) = dct_const_round_shift(input[1] * cospi_8_64 + input[3] * cospi_24_64)

	229 vqrshrn.s32 d30, q8, #14 ; >> 14

	230 vqrshrn.s32 d31, q12, #14 ; >> 14

	231

	232 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]

	233 vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]

	234 vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]

	235 vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]

	236

	237 ; stage 3 - odd half

	238 vdup.16 d16, r7 ; duplicate cospi_16_64

	239

	240 ; stage 2 - odd half

	241 vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]

	242 vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]

	243 vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]

	244 vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]

	245

	246 ; step2[6] * cospi_16_64

	247 vmull.s16 q9, d28, d16

	248 vmull.s16 q10, d29, d16

	249

	250 ; step2[6] * cospi_16_64

	251 vmull.s16 q11, d28, d16

	252 vmull.s16 q12, d29, d16

	253

	254 ; (step2[6] - step2[5]) * cospi_16_64

	255 vmlsl.s16 q9, d26, d16

	256 vmlsl.s16 q10, d27, d16

	257

	258 ; (step2[5] + step2[6]) * cospi_16_64

	259 vmlal.s16 q11, d26, d16

	260 vmlal.s16 q12, d27, d16

	261

	262 ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)

	263 vqrshrn.s32 d10, q9, #14 ; >> 14

	264 vqrshrn.s32 d11, q10, #14 ; >> 14

	265

	266 ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)

	267 vqrshrn.s32 d12, q11, #14 ; >> 14

	268 vqrshrn.s32 d13, q12, #14 ; >> 14

	269

	270 ; stage 4

	271 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];

	272 vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];

	273 vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];

	274 vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];

	275 vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];

	276 vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];

	277 vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];

	278 vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];

	279 MEND

	280

	281 ; Parallel 1D IADST on all the columns of an 8x8 matrix of 16-bit data, which is

	282 ; loaded in q8-q15. The IADST constants are read from r0 - r10 and r12. The

	283 ; output will be stored back into q8-q15 registers. This macro will touch

	284 ; q0 - q7 registers and use them as buffer during calculation.

	285 MACRO

	286 IADST8X8_1D

	287 vdup.16 d14, r0 ; duplicate cospi_2_64

	288 vdup.16 d15, r1 ; duplicate cospi_30_64

	289

	290 ; cospi_2_64 * x0

	291 vmull.s16 q1, d30, d14

	292 vmull.s16 q2, d31, d14

	293

	294 ; cospi_30_64 * x0

	295 vmull.s16 q3, d30, d15

	296 vmull.s16 q4, d31, d15

	297

	298 vdup.16 d30, r4 ; duplicate cospi_18_64

	299 vdup.16 d31, r5 ; duplicate cospi_14_64

	300

	301 ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

	302 vmlal.s16 q1, d16, d15

	303 vmlal.s16 q2, d17, d15

	304

	305 ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1

	306 vmlsl.s16 q3, d16, d14

	307 vmlsl.s16 q4, d17, d14

	308

	309 ; cospi_18_64 * x4

	310 vmull.s16 q5, d22, d30

	311 vmull.s16 q6, d23, d30

	312

	313 ; cospi_14_64 * x4

	314 vmull.s16 q7, d22, d31

	315 vmull.s16 q8, d23, d31

	316

	317 ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

	318 vmlal.s16 q5, d24, d31

	319 vmlal.s16 q6, d25, d31

	320

	321 ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5

	322 vmlsl.s16 q7, d24, d30

	323 vmlsl.s16 q8, d25, d30

	324

	325 ; (s0 + s4)

	326 vadd.s32 q11, q1, q5

	327 vadd.s32 q12, q2, q6

	328

	329 vdup.16 d0, r2 ; duplicate cospi_10_64

	330 vdup.16 d1, r3 ; duplicate cospi_22_64

	331

	332 ; (s0 - s4)

	333 vsub.s32 q1, q1, q5

	334 vsub.s32 q2, q2, q6

	335

	336 ; x0 = dct_const_round_shift(s0 + s4);

	337 vqrshrn.s32 d22, q11, #14 ; >> 14

	338 vqrshrn.s32 d23, q12, #14 ; >> 14

	339

	340 ; (s1 + s5)

	341 vadd.s32 q12, q3, q7

	342 vadd.s32 q15, q4, q8

	343

	344 ; (s1 - s5)

	345 vsub.s32 q3, q3, q7

	346 vsub.s32 q4, q4, q8

	347

	348 ; x4 = dct_const_round_shift(s0 - s4);

	349 vqrshrn.s32 d2, q1, #14 ; >> 14

	350 vqrshrn.s32 d3, q2, #14 ; >> 14

	351

	352 ; x1 = dct_const_round_shift(s1 + s5);

	353 vqrshrn.s32 d24, q12, #14 ; >> 14

	354 vqrshrn.s32 d25, q15, #14 ; >> 14

	355

	356 ; x5 = dct_const_round_shift(s1 - s5);

	357 vqrshrn.s32 d6, q3, #14 ; >> 14

	358 vqrshrn.s32 d7, q4, #14 ; >> 14

	359

	360 ; cospi_10_64 * x2

	361 vmull.s16 q4, d26, d0

	362 vmull.s16 q5, d27, d0

	363

	364 ; cospi_22_64 * x2

	365 vmull.s16 q2, d26, d1

	366 vmull.s16 q6, d27, d1

	367

	368 vdup.16 d30, r6 ; duplicate cospi_26_64

	369 vdup.16 d31, r7 ; duplicate cospi_6_64

	370

	371 ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

	372 vmlal.s16 q4, d20, d1

	373 vmlal.s16 q5, d21, d1

	374

	375 ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

	376 vmlsl.s16 q2, d20, d0

	377 vmlsl.s16 q6, d21, d0

	378

	379 ; cospi_26_64 * x6

	380 vmull.s16 q0, d18, d30

	381 vmull.s16 q13, d19, d30

	382

	383 ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

	384 vmlal.s16 q0, d28, d31

	385 vmlal.s16 q13, d29, d31

	386

	387 ; cospi_6_64 * x6

	388 vmull.s16 q10, d18, d31

	389 vmull.s16 q9, d19, d31

	390

	391 ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

	392 vmlsl.s16 q10, d28, d30

	393 vmlsl.s16 q9, d29, d30

	394

	395 ; (s3 + s7)

	396 vadd.s32 q14, q2, q10

	397 vadd.s32 q15, q6, q9

	398

	399 ; (s3 - s7)

	400 vsub.s32 q2, q2, q10

	401 vsub.s32 q6, q6, q9

	402

	403 ; x3 = dct_const_round_shift(s3 + s7);

	404 vqrshrn.s32 d28, q14, #14 ; >> 14

	405 vqrshrn.s32 d29, q15, #14 ; >> 14

	406

	407 ; x7 = dct_const_round_shift(s3 - s7);

	408 vqrshrn.s32 d4, q2, #14 ; >> 14

	409 vqrshrn.s32 d5, q6, #14 ; >> 14

	410

	411 ; (s2 + s6)

	412 vadd.s32 q9, q4, q0

	413 vadd.s32 q10, q5, q13

	414

	415 ; (s2 - s6)

	416 vsub.s32 q4, q4, q0

	417 vsub.s32 q5, q5, q13

	418

	419 vdup.16 d30, r8 ; duplicate cospi_8_64

	420 vdup.16 d31, r9 ; duplicate cospi_24_64

	421

	422 ; x2 = dct_const_round_shift(s2 + s6);

	423 vqrshrn.s32 d18, q9, #14 ; >> 14

	424 vqrshrn.s32 d19, q10, #14 ; >> 14

	425

	426 ; x6 = dct_const_round_shift(s2 - s6);

	427 vqrshrn.s32 d8, q4, #14 ; >> 14

	428 vqrshrn.s32 d9, q5, #14 ; >> 14

	429

	430 ; cospi_8_64 * x4

	431 vmull.s16 q5, d2, d30

	432 vmull.s16 q6, d3, d30

	433

	434 ; cospi_24_64 * x4

	435 vmull.s16 q7, d2, d31

	436 vmull.s16 q0, d3, d31

	437

	438 ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;

	439 vmlal.s16 q5, d6, d31

	440 vmlal.s16 q6, d7, d31

	441

	442 ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;

	443 vmlsl.s16 q7, d6, d30

	444 vmlsl.s16 q0, d7, d30

	445

	446 ; cospi_8_64 * x7

	447 vmull.s16 q1, d4, d30

	448 vmull.s16 q3, d5, d30

	449

	450 ; cospi_24_64 * x7

	451 vmull.s16 q10, d4, d31

	452 vmull.s16 q2, d5, d31

	453

	454 ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;

	455 vmlsl.s16 q1, d8, d31

	456 vmlsl.s16 q3, d9, d31

	457

	458 ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

	459 vmlal.s16 q10, d8, d30

	460 vmlal.s16 q2, d9, d30

	461

	462 vadd.s16 q8, q11, q9 ; x0 = s0 + s2;

	463

	464 vsub.s16 q11, q11, q9 ; x2 = s0 - s2;

	465

	466 vadd.s16 q4, q12, q14 ; x1 = s1 + s3;

	467

	468 vsub.s16 q12, q12, q14 ; x3 = s1 - s3;

	469

	470 ; (s4 + s6)

	471 vadd.s32 q14, q5, q1

	472 vadd.s32 q15, q6, q3

	473

	474 ; (s4 - s6)

	475 vsub.s32 q5, q5, q1

	476 vsub.s32 q6, q6, q3

	477

	478 ; x4 = dct_const_round_shift(s4 + s6);

	479 vqrshrn.s32 d18, q14, #14 ; >> 14

	480 vqrshrn.s32 d19, q15, #14 ; >> 14

	481

	482 ; x6 = dct_const_round_shift(s4 - s6);

	483 vqrshrn.s32 d10, q5, #14 ; >> 14

	484 vqrshrn.s32 d11, q6, #14 ; >> 14

	485

	486 ; (s5 + s7)

	487 vadd.s32 q1, q7, q10

	488 vadd.s32 q3, q0, q2

	489

	490 ; (s5 - s7)

	491 vsub.s32 q7, q7, q10

	492 vsub.s32 q0, q0, q2

	493

	494 ; x5 = dct_const_round_shift(s5 + s7);

	495 vqrshrn.s32 d28, q1, #14 ; >> 14

	496 vqrshrn.s32 d29, q3, #14 ; >> 14

	497

	498 ; x7 = dct_const_round_shift(s5 - s7);

	499 vqrshrn.s32 d14, q7, #14 ; >> 14

	500 vqrshrn.s32 d15, q0, #14 ; >> 14

	501

	502 vdup.16 d30, r12 ; duplicate cospi_16_64

	503

	504 ; cospi_16_64 * x2

	505 vmull.s16 q2, d22, d30

	506 vmull.s16 q3, d23, d30

	507

	508 ; cospi_16_64 * x2

	509 vmull.s16 q13, d22, d30

	510 vmull.s16 q1, d23, d30

	511

	512 ; cospi_16_64 * x2 + cospi_16_64 * x3;

	513 vmlal.s16 q2, d24, d30

	514 vmlal.s16 q3, d25, d30

	515

	516 ; cospi_16_64 * x2 - cospi_16_64 * x3;

	517 vmlsl.s16 q13, d24, d30

	518 vmlsl.s16 q1, d25, d30

	519

	520 ; x2 = dct_const_round_shift(s2);

	521 vqrshrn.s32 d4, q2, #14 ; >> 14

	522 vqrshrn.s32 d5, q3, #14 ; >> 14

	523

	524 ; x3 = dct_const_round_shift(s3);

	525 vqrshrn.s32 d24, q13, #14 ; >> 14

	526 vqrshrn.s32 d25, q1, #14 ; >> 14

	527

	528 ; cospi_16_64 * x6

	529 vmull.s16 q13, d10, d30

	530 vmull.s16 q1, d11, d30

	531

	532 ; cospi_16_64 * x6

	533 vmull.s16 q11, d10, d30

	534 vmull.s16 q0, d11, d30

	535

	536 ; cospi_16_64 * x6 + cospi_16_64 * x7;

	537 vmlal.s16 q13, d14, d30

	538 vmlal.s16 q1, d15, d30

	539

	540 ; cospi_16_64 * x6 - cospi_16_64 * x7;

	541 vmlsl.s16 q11, d14, d30

	542 vmlsl.s16 q0, d15, d30

	543

	544 ; x6 = dct_const_round_shift(s6);

	545 vqrshrn.s32 d20, q13, #14 ; >> 14

	546 vqrshrn.s32 d21, q1, #14 ; >> 14

	547

	548 ; x7 = dct_const_round_shift(s7);

	549 vqrshrn.s32 d12, q11, #14 ; >> 14

	550 vqrshrn.s32 d13, q0, #14 ; >> 14

	551

	552 vdup.16 q5, r10 ; duplicate 0, used below to negate by subtraction; q8, q10, q12 and q14 already hold x0, x6, x3 and x5 and are left unchanged (presumably output[0], [2], [4], [6])

	553

	554 vsub.s16 q9, q5, q9 ; output[1] = -x4;

	555 vsub.s16 q11, q5, q2 ; output[3] = -x2;

	556 vsub.s16 q13, q5, q6 ; output[5] = -x7;

	557 vsub.s16 q15, q5, q4 ; output[7] = -x1;

	558 MEND

	559

	560

	561 AREA Block, CODE, READONLY ; name this block of code

	562 ;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,

	563 ; int dest_stride, int tx_type)

	564 ; Applies a 2D inverse 8x8 transform to input, rounds the result by 5 bits, adds it to the 8x8 block at dest and stores the sum clipped to 8 bits.

	565 ; r0 int16_t *input

	566 ; r1 uint8_t *dest

	567 ; r2 int dest_stride

	568 ; r3 int tx_type)

	569 ; This function will only handle tx_type of 1,2,3: 2 selects idct_iadst, 3 selects iadst_iadst and any other value falls through to iadst_idct. r0-r10 and d8-d15 are saved and restored around the transform; r12 is overwritten by the IADST constants.

	570 \|vp9_iht8x8_64_add_neon\| PROC

	571

	572 ; load the 64 input coefficients into q8-q15 (r0 is advanced by the loads)

	573 vld1.s16 {q8,q9}, [r0]!

	574 vld1.s16 {q10,q11}, [r0]!

	575 vld1.s16 {q12,q13}, [r0]!

	576 vld1.s16 {q14,q15}, [r0]!

	577

	578 push {r0-r10}

	579 vpush {d8-d15} ; q4-q7 are callee-saved under the AAPCS and both 1D transform macros use them as scratch

	580 ; transpose the input data

	581 TRANSPOSE8X8

	582

	583 ; decide the type of transform

	584 cmp r3, #2

	585 beq idct_iadst

	586 cmp r3, #3

	587 beq iadst_iadst

	588

	589 iadst_idct

	590 ; generate IDCT constants

	591 GENERATE_IDCT_CONSTANTS

	592

	593 ; first transform rows

	594 IDCT8x8_1D

	595

	596 ; transpose the matrix

	597 TRANSPOSE8X8

	598

	599 ; generate IADST constants

	600 GENERATE_IADST_CONSTANTS

	601

	602 ; then transform columns

	603 IADST8X8_1D

	604

	605 b end_vp9_iht8x8_64_add_neon

	606

	607 idct_iadst

	608 ; generate IADST constants

	609 GENERATE_IADST_CONSTANTS

	610

	611 ; first transform rows

	612 IADST8X8_1D

	613

	614 ; transpose the matrix

	615 TRANSPOSE8X8

	616

	617 ; generate IDCT constants

	618 GENERATE_IDCT_CONSTANTS

	619

	620 ; then transform columns

	621 IDCT8x8_1D

	622

	623 b end_vp9_iht8x8_64_add_neon

	624

	625 iadst_iadst

	626 ; generate IADST constants

	627 GENERATE_IADST_CONSTANTS

	628

	629 ; first transform rows

	630 IADST8X8_1D

	631

	632 ; transpose the matrix

	633 TRANSPOSE8X8

	634

	635 ; then transform columns (the IADST constants generated above are still in the core registers)

	636 IADST8X8_1D

	637

	638 end_vp9_iht8x8_64_add_neon

	639 vpop {d8-d15} ; restore the callee-saved q4-q7 (must come before the core-register pop, mirroring the push order)

	640 pop {r0-r10}

	641 ; ROUND_POWER_OF_TWO(temp_out[j], 5)

	642 vrshr.s16 q8, q8, #5

	643 vrshr.s16 q9, q9, #5

	644 vrshr.s16 q10, q10, #5

	645 vrshr.s16 q11, q11, #5

	646 vrshr.s16 q12, q12, #5

	647 vrshr.s16 q13, q13, #5

	648 vrshr.s16 q14, q14, #5

	649 vrshr.s16 q15, q15, #5

	650

	651 ; save dest pointer (r1 walks the rows while loading, r0 walks them again while storing)

	652 mov r0, r1

	653

	654 ; load destination data, one 8-byte row per d register

	655 vld1.64 {d0}, [r1], r2

	656 vld1.64 {d1}, [r1], r2

	657 vld1.64 {d2}, [r1], r2

	658 vld1.64 {d3}, [r1], r2

	659 vld1.64 {d4}, [r1], r2

	660 vld1.64 {d5}, [r1], r2

	661 vld1.64 {d6}, [r1], r2

	662 vld1.64 {d7}, [r1]

	663

	664 ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]

	665 vaddw.u8 q8, q8, d0

	666 vaddw.u8 q9, q9, d1

	667 vaddw.u8 q10, q10, d2

	668 vaddw.u8 q11, q11, d3

	669 vaddw.u8 q12, q12, d4

	670 vaddw.u8 q13, q13, d5

	671 vaddw.u8 q14, q14, d6

	672 vaddw.u8 q15, q15, d7

	673

	674 ; clip_pixel (saturating narrow from signed 16-bit to unsigned 8-bit)

	675 vqmovun.s16 d0, q8

	676 vqmovun.s16 d1, q9

	677 vqmovun.s16 d2, q10

	678 vqmovun.s16 d3, q11

	679 vqmovun.s16 d4, q12

	680 vqmovun.s16 d5, q13

	681 vqmovun.s16 d6, q14

	682 vqmovun.s16 d7, q15

	683

	684 ; store the data

	685 vst1.64 {d0}, [r0], r2

	686 vst1.64 {d1}, [r0], r2

	687 vst1.64 {d2}, [r0], r2

	688 vst1.64 {d3}, [r0], r2

	689 vst1.64 {d4}, [r0], r2

	690 vst1.64 {d5}, [r0], r2

	691 vst1.64 {d6}, [r0], r2

	692 vst1.64 {d7}, [r0], r2

	693 bx lr

	694 ENDP ; \|vp9_iht8x8_64_add_neon\|

	695

	696 END

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm ('k') | source/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm » ('j') | no next file with comments »