Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 ;TODO(cd): adjust these constants to be able to use vqdmulh for faster
12 ; dct_const_round_shift(a * b) within butterfly calculations.
13 cospi_1_64 EQU 16364
14 cospi_2_64 EQU 16305
15 cospi_3_64 EQU 16207
16 cospi_4_64 EQU 16069
17 cospi_5_64 EQU 15893
18 cospi_6_64 EQU 15679
19 cospi_7_64 EQU 15426
20 cospi_8_64 EQU 15137
21 cospi_9_64 EQU 14811
22 cospi_10_64 EQU 14449
23 cospi_11_64 EQU 14053
24 cospi_12_64 EQU 13623
25 cospi_13_64 EQU 13160
26 cospi_14_64 EQU 12665
27 cospi_15_64 EQU 12140
28 cospi_16_64 EQU 11585
29 cospi_17_64 EQU 11003
30 cospi_18_64 EQU 10394
31 cospi_19_64 EQU 9760
32 cospi_20_64 EQU 9102
33 cospi_21_64 EQU 8423
34 cospi_22_64 EQU 7723
35 cospi_23_64 EQU 7005
36 cospi_24_64 EQU 6270
37 cospi_25_64 EQU 5520
38 cospi_26_64 EQU 4756
39 cospi_27_64 EQU 3981
40 cospi_28_64 EQU 3196
41 cospi_29_64 EQU 2404
42 cospi_30_64 EQU 1606
43 cospi_31_64 EQU 804
44
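; For reference, each cospi_k_64 above is round(cos(k * pi / 64) * 2^14),
; and dct_const_round_shift() is a rounding right shift by 14 bits. A
; minimal C sketch that reproduces the table and the rounding step
; (illustrative names, not part of this file):
;   #include <math.h>
;   #include <stdint.h>
;   #define DCT_CONST_BITS 14
;   static int16_t cospi_k_64(int k) {           /* k in [1, 31] */
;     return (int16_t)lrint(cos(k * M_PI / 64.0) * (1 << DCT_CONST_BITS));
;   }
;   static int16_t dct_const_round_shift(int32_t input) {
;     /* same rounding as the vqrshrn.s32 #14 instructions used below */
;     return (int16_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
;   }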
45
46 EXPORT |vp9_idct32x32_1024_add_neon|
47 ARM
48 REQUIRE8
49 PRESERVE8
50
51 AREA ||.text||, CODE, READONLY, ALIGN=2
52
53 AREA Block, CODE, READONLY
54
55 ; --------------------------------------------------------------------------
56 ; Load from transposed_buffer
57 ; q13 = transposed_buffer[first_offset]
58 ; q14 = transposed_buffer[second_offset]
59 ; for proper address calculation, the last offset used when manipulating
60 ; transposed_buffer must be passed in. Use 0 for the first use.
61 MACRO
62 LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
63 ; address calculation with proper stride and loading
64 add r0, #($first_offset - $prev_offset )*8*2
65 vld1.s16 {q14}, [r0]
66 add r0, #($second_offset - $first_offset)*8*2
67 vld1.s16 {q13}, [r0]
68 ; (used) two registers (q14, q13)
69 MEND
70 ; --------------------------------------------------------------------------
71 ; Load from output (used as temporary storage)
72 ; reg1 = output[first_offset]
73 ; reg2 = output[second_offset]
74 ; for proper address calculation, the last offset used when manipulating
75 ; output (whether reading or storing) must be passed in. Use 0 for the
76 ; first use.
77 MACRO
78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
79 ; address calculation with proper stride and loading
80 add r1, #($first_offset - $prev_offset )*32*2
81 vld1.s16 {$reg1}, [r1]
82 add r1, #($second_offset - $first_offset)*32*2
83 vld1.s16 {$reg2}, [r1]
84 ; (used) two registers ($reg1, $reg2)
85 MEND
86 ; --------------------------------------------------------------------------
87 ; Store into output (sometimes as temporary storage)
88 ; output[first_offset] = reg1
89 ; output[second_offset] = reg2
90 ; for proper address calculation, the last offset used when manipulating
91 ; output (whether reading or storing) must be passed in. Use 0 for the
92 ; first use.
93 MACRO
94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
95 ; address calculation with proper stride and storing
96 add r1, #($first_offset - $prev_offset )*32*2
97 vst1.16 {$reg1}, [r1]
98 add r1, #($second_offset - $first_offset)*32*2
99 vst1.16 {$reg2}, [r1]
100 MEND
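; For reference, all three macros above keep a running pointer and add only
; the byte delta between consecutive offsets. A C sketch of the equivalent
; addressing (hypothetical names; transposed_buffer rows are 8 int16_t wide
; and output rows are 32 int16_t wide, hence the *8*2 and *32*2 byte scales):
;   int16_t *p = base;                               /* r0 or r1 */
;   p += (first_offset - prev_offset) * row_width;   /* row_width = 8 or 32 */
;   load_or_store_8_lanes(p);                        /* one q register */
;   p += (second_offset - first_offset) * row_width;
;   load_or_store_8_lanes(p);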
101 ; --------------------------------------------------------------------------
102 ; Combine-add results with current destination content
103 ; q6-q9 contain the results (out[j * 32 + 0-31])
104 MACRO
105 STORE_COMBINE_CENTER_RESULTS
106 ; load dest[j * dest_stride + 0-31]
107 vld1.s16 {d8}, [r10], r2
108 vld1.s16 {d11}, [r9], r11
109 vld1.s16 {d9}, [r10]
110 vld1.s16 {d10}, [r9]
111 ; ROUND_POWER_OF_TWO
112 vrshr.s16 q7, q7, #6
113 vrshr.s16 q8, q8, #6
114 vrshr.s16 q9, q9, #6
115 vrshr.s16 q6, q6, #6
116 ; add to dest[j * dest_stride + 0-31]
117 vaddw.u8 q7, q7, d9
118 vaddw.u8 q8, q8, d10
119 vaddw.u8 q9, q9, d11
120 vaddw.u8 q6, q6, d8
121 ; clip pixel
122 vqmovun.s16 d9, q7
123 vqmovun.s16 d10, q8
124 vqmovun.s16 d11, q9
125 vqmovun.s16 d8, q6
126 ; store back into dest[j * dest_stride + 0-31]
127 vst1.16 {d9}, [r10], r11
128 vst1.16 {d10}, [r9], r2
129 vst1.16 {d8}, [r10]
130 vst1.16 {d11}, [r9]
131 ; update pointers (by dest_stride * 2)
132 sub r9, r9, r2, lsl #1
133 add r10, r10, r2, lsl #1
134 MEND
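; For reference, the STORE_COMBINE_* macros implement the combine step of
; the C reference code (a sketch; clip_pixel() clamps to [0, 255]):
;   for (i = 0; i < 8; ++i)
;     dest[j * dest_stride + i] = clip_pixel(
;         ROUND_POWER_OF_TWO(out[j * 32 + i], 6) + dest[j * dest_stride + i]);
; vrshr.s16 #6 is the rounding shift, vaddw.u8 widens and adds the current
; destination pixels, and vqmovun.s16 saturates back to 8 bits (the clip).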
135 ; --------------------------------------------------------------------------
136 ; Combine-add results with current destination content
137 ; q6-q9 contain the results (out[j * 32 + 0-31])
138 MACRO
139 STORE_COMBINE_CENTER_RESULTS_LAST
140 ; load dest[j * dest_stride + 0-31]
141 vld1.s16 {d8}, [r10], r2
142 vld1.s16 {d11}, [r9], r11
143 vld1.s16 {d9}, [r10]
144 vld1.s16 {d10}, [r9]
145 ; ROUND_POWER_OF_TWO
146 vrshr.s16 q7, q7, #6
147 vrshr.s16 q8, q8, #6
148 vrshr.s16 q9, q9, #6
149 vrshr.s16 q6, q6, #6
150 ; add to dest[j * dest_stride + 0-31]
151 vaddw.u8 q7, q7, d9
152 vaddw.u8 q8, q8, d10
153 vaddw.u8 q9, q9, d11
154 vaddw.u8 q6, q6, d8
155 ; clip pixel
156 vqmovun.s16 d9, q7
157 vqmovun.s16 d10, q8
158 vqmovun.s16 d11, q9
159 vqmovun.s16 d8, q6
160 ; store back into dest[j * dest_stride + 0-31]
161 vst1.16 {d9}, [r10], r11
162 vst1.16 {d10}, [r9], r2
163 vst1.16 {d8}, [r10]!
164 vst1.16 {d11}, [r9]!
165 ; update pointers (by dest_stride * 2)
166 sub r9, r9, r2, lsl #1
167 add r10, r10, r2, lsl #1
168 MEND
169 ; --------------------------------------------------------------------------
170 ; Combine-add results with current destination content
171 ; q4-q7 contain the results (out[j * 32 + 0-31])
172 MACRO
173 STORE_COMBINE_EXTREME_RESULTS
174 ; load dest[j * dest_stride + 0-31]
175 vld1.s16 {d4}, [r7], r2
176 vld1.s16 {d7}, [r6], r11
177 vld1.s16 {d5}, [r7]
178 vld1.s16 {d6}, [r6]
179 ; ROUND_POWER_OF_TWO
180 vrshr.s16 q5, q5, #6
181 vrshr.s16 q6, q6, #6
182 vrshr.s16 q7, q7, #6
183 vrshr.s16 q4, q4, #6
184 ; add to dest[j * dest_stride + 0-31]
185 vaddw.u8 q5, q5, d5
186 vaddw.u8 q6, q6, d6
187 vaddw.u8 q7, q7, d7
188 vaddw.u8 q4, q4, d4
189 ; clip pixel
190 vqmovun.s16 d5, q5
191 vqmovun.s16 d6, q6
192 vqmovun.s16 d7, q7
193 vqmovun.s16 d4, q4
194 ; store back into dest[j * dest_stride + 0-31]
195 vst1.16 {d5}, [r7], r11
196 vst1.16 {d6}, [r6], r2
197 vst1.16 {d7}, [r6]
198 vst1.16 {d4}, [r7]
199 ; update pointers (by dest_stride * 2)
200 sub r6, r6, r2, lsl #1
201 add r7, r7, r2, lsl #1
202 MEND
203 ; --------------------------------------------------------------------------
204 ; Combine-add results with current destination content
205 ; q4-q7 contain the results (out[j * 32 + 0-31])
206 MACRO
207 STORE_COMBINE_EXTREME_RESULTS_LAST
208 ; load dest[j * dest_stride + 0-31]
209 vld1.s16 {d4}, [r7], r2
210 vld1.s16 {d7}, [r6], r11
211 vld1.s16 {d5}, [r7]
212 vld1.s16 {d6}, [r6]
213 ; ROUND_POWER_OF_TWO
214 vrshr.s16 q5, q5, #6
215 vrshr.s16 q6, q6, #6
216 vrshr.s16 q7, q7, #6
217 vrshr.s16 q4, q4, #6
218 ; add to dest[j * dest_stride + 0-31]
219 vaddw.u8 q5, q5, d5
220 vaddw.u8 q6, q6, d6
221 vaddw.u8 q7, q7, d7
222 vaddw.u8 q4, q4, d4
223 ; clip pixel
224 vqmovun.s16 d5, q5
225 vqmovun.s16 d6, q6
226 vqmovun.s16 d7, q7
227 vqmovun.s16 d4, q4
228 ; store back into dest[j * dest_stride + 0-31]
229 vst1.16 {d5}, [r7], r11
230 vst1.16 {d6}, [r6], r2
231 vst1.16 {d7}, [r6]!
232 vst1.16 {d4}, [r7]!
233 ; update pointers (by dest_stride * 2)
234 sub r6, r6, r2, lsl #1
235 add r7, r7, r2, lsl #1
236 MEND
237 ; --------------------------------------------------------------------------
238 ; Touches q8-q12, q15 (q13-q14 are preserved)
239 ; valid output registers are anything but q8-q11
240 MACRO
241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
242 ; TODO(cd): have special case to re-use constants when they are similar for
243 ; consecutive butterflies
244 ; TODO(cd): have special case when both constants are the same, do the
245 ; additions/subtractions before the multiplies.
246 ; generate the constants
247 ; generate scalar constants
248 mov r8, #$first_constant & 0xFF00
249 mov r12, #$second_constant & 0xFF00
250 add r8, #$first_constant & 0x00FF
251 add r12, #$second_constant & 0x00FF
252 ; generate vector constants
253 vdup.16 d30, r8
254 vdup.16 d31, r12
255 ; (used) two for inputs (regA-regD), one for constants (q15)
256 ; do some multiplications (ordered for maximum latency hiding)
257 vmull.s16 q8, $regC, d30
258 vmull.s16 q10, $regA, d31
259 vmull.s16 q9, $regD, d30
260 vmull.s16 q11, $regB, d31
261 vmull.s16 q12, $regC, d31
262 ; (used) five for intermediate (q8-q12), one for constants (q15)
263 ; do some additions/subtractions (to get back to two registers)
264 vsub.s32 q8, q8, q10
265 vsub.s32 q9, q9, q11
266 ; do more multiplications (ordered for maximum latency hiding)
267 vmull.s16 q10, $regD, d31
268 vmull.s16 q11, $regA, d30
269 vmull.s16 q15, $regB, d30
270 ; (used) six for intermediate (q8-q12, q15)
271 ; do more additions/subtractions
272 vadd.s32 q11, q12, q11
273 vadd.s32 q10, q10, q15
274 ; (used) four for intermediate (q8-q11)
275 ; dct_const_round_shift
276 vqrshrn.s32 $reg1, q8, #14
277 vqrshrn.s32 $reg2, q9, #14
278 vqrshrn.s32 $reg3, q11, #14
279 vqrshrn.s32 $reg4, q10, #14
280 ; (used) two q registers for results (i.e. four d registers)
281 MEND
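; For reference, DO_BUTTERFLY computes the standard rotate/round pair from
; the C reference code (a sketch; a = $regA:$regB and c = $regC:$regD, eight
; 16-bit lanes each):
;   int32_t temp1 = c * first_constant  - a * second_constant;
;   int32_t temp2 = c * second_constant + a * first_constant;
;   reg1_reg2 = dct_const_round_shift(temp1);   /* vqrshrn.s32 #14 */
;   reg3_reg4 = dct_const_round_shift(temp2);
; The mov/add pair above materializes each 16-bit constant in two steps
; because a plain ARM mov immediate is limited to an 8-bit rotated value.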
282 ; --------------------------------------------------------------------------
283 ; Touches q8-q12, q15 (q13-q14 are preserved)
284 ; valid output registers are anything but q8-q11
285 MACRO
286 DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
287 DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
288 MEND
289 ; --------------------------------------------------------------------------
290
291 ;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
292 ;
293 ; r0 int16_t *input,
294 ; r1 uint8_t *dest,
295 ; r2 int dest_stride)
296 ; loop counters
297 ; r4 bands loop counter
298 ; r5 pass loop counter
299 ; r8 transpose loop counter
300 ; combine-add pointers
301 ; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...)
302 ; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...)
303 ; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
304 ; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
305
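; For reference, a minimal (hypothetical) call site; input holds the 32x32
; dequantized coefficient block in raster order, and the reconstructed
; residual is added into the 8-bit destination in place:
;   int16_t coeffs[32 * 32];   /* filled by the dequantizer */
;   uint8_t dest[32 * 32];     /* prediction block; dest_stride == 32 here */
;   vp9_idct32x32_1024_add_neon(coeffs, dest, 32);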
306 |vp9_idct32x32_1024_add_neon| PROC
307 ; This function performs the idct32x32 transform in two passes.
308 ;
309 ; This is done by transposing the input and then doing a 1d transform on
310 ; columns. In the first pass, the transposed columns are the original
311 ; rows. In the second pass, after the transposition, the columns are the
312 ; original columns.
313 ; The 1d transform is done by looping over bands of eight columns (the
314 ; idct32_bands loop). For each band, the transform input transposition
315 ; is done on demand, one band of four 8x8 matrices at a time. The four
316 ; matrices are transposed by pairs (the idct32_transpose_pair loop).
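;
; A C sketch of that loop structure (illustrative names only):
;   for (pass = 0; pass < 2; ++pass) {        /* idct32_pass_loop           */
;     for (band = 0; band < 4; ++band) {      /* idct32_bands_loop          */
;       for (pair = 0; pair < 2; ++pair)      /* idct32_transpose_pair_loop */
;         transpose_two_8x8(band, pair);      /* fills transpose_buffer     */
;       idct32_1d_on_band(band);              /* BLOCKs A-D below           */
;     }
;   }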
317 push {r4-r11}
318 vpush {d8-d15}
319 ; stack operation
320 ; internal buffer into which 8 lines are transposed before being transformed
321 ; int16_t transpose_buffer[32 * 8];
322 ; at sp + [4096, 4607]
323 ; results of the first pass (transpose and transform rows)
324 ; int16_t pass1[32 * 32];
325 ; at sp + [0, 2047]
326 ; results of the second pass (transpose and transform columns)
327 ; int16_t pass2[32 * 32];
328 ; at sp + [2048, 4095]
329 sub sp, sp, #512+2048+2048
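; equivalently, as a C sketch of the scratch area reserved above:
;   struct {
;     int16_t pass1[32 * 32];            /* sp + [0, 2047]    */
;     int16_t pass2[32 * 32];            /* sp + [2048, 4095] */
;     int16_t transpose_buffer[32 * 8];  /* sp + [4096, 4607] */
;   } scratch;                           /* 2048+2048+512 bytes in total */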
330
331 ; r6 = dest + 31 * dest_stride
332 ; r7 = dest + 0 * dest_stride
333 ; r9 = dest + 15 * dest_stride
334 ; r10 = dest + 16 * dest_stride
335 rsb r6, r2, r2, lsl #5
336 rsb r9, r2, r2, lsl #4
337 add r10, r1, r2, lsl #4
338 mov r7, r1
339 add r6, r6, r1
340 add r9, r9, r1
341 ; r11 = -dest_stride
342 neg r11, r2
343 ; r3 = input
344 mov r3, r0
345 ; parameters for first pass
346 ; r0 = transpose_buffer[32 * 8]
347 add r0, sp, #4096
348 ; r1 = pass1[32 * 32]
349 mov r1, sp
350
351 mov r5, #0 ; initialize pass loop counter
352 idct32_pass_loop
353 mov r4, #4 ; initialize bands loop counter
354 idct32_bands_loop
355 mov r8, #2 ; initialize transpose loop counter
356 idct32_transpose_pair_loop
357 ; Load two horizontally consecutive 8x8 16-bit data matrices. The first
358 ; one into q8-q15 and the second one into q0-q7. There is a stride of 64,
359 ; adjusted to 32 because of the two post-increments.
360 vld1.s16 {q8}, [r3]!
361 vld1.s16 {q0}, [r3]!
362 add r3, #32
363 vld1.s16 {q9}, [r3]!
364 vld1.s16 {q1}, [r3]!
365 add r3, #32
366 vld1.s16 {q10}, [r3]!
367 vld1.s16 {q2}, [r3]!
368 add r3, #32
369 vld1.s16 {q11}, [r3]!
370 vld1.s16 {q3}, [r3]!
371 add r3, #32
372 vld1.s16 {q12}, [r3]!
373 vld1.s16 {q4}, [r3]!
374 add r3, #32
375 vld1.s16 {q13}, [r3]!
376 vld1.s16 {q5}, [r3]!
377 add r3, #32
378 vld1.s16 {q14}, [r3]!
379 vld1.s16 {q6}, [r3]!
380 add r3, #32
381 vld1.s16 {q15}, [r3]!
382 vld1.s16 {q7}, [r3]!
383
384 ; Transpose the two 8x8 16bit data matrices.
385 vswp d17, d24
386 vswp d23, d30
387 vswp d21, d28
388 vswp d19, d26
389 vswp d1, d8
390 vswp d7, d14
391 vswp d5, d12
392 vswp d3, d10
393 vtrn.32 q8, q10
394 vtrn.32 q9, q11
395 vtrn.32 q12, q14
396 vtrn.32 q13, q15
397 vtrn.32 q0, q2
398 vtrn.32 q1, q3
399 vtrn.32 q4, q6
400 vtrn.32 q5, q7
401 vtrn.16 q8, q9
402 vtrn.16 q10, q11
403 vtrn.16 q12, q13
404 vtrn.16 q14, q15
405 vtrn.16 q0, q1
406 vtrn.16 q2, q3
407 vtrn.16 q4, q5
408 vtrn.16 q6, q7
409
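; For reference, the vswp/vtrn sequence above is the standard three-level
; NEON 8x8 transpose, applied to each matrix: vswp exchanges the
; off-diagonal 4x4 blocks (64-bit register halves), vtrn.32 transposes the
; 2x2 sub-blocks of 32-bit pairs, and vtrn.16 finishes with the individual
; 16-bit elements. The net effect, as a C sketch for one 8x8 matrix m:
;   for (i = 0; i < 8; ++i)
;     for (j = 0; j < 8; ++j)
;       t[j][i] = m[i][j];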
410 ; Store both matrices after each other. There is a stride of 32, which
411 ; is covered entirely by the post-increments.
412 vst1.16 {q8}, [r0]!
413 vst1.16 {q9}, [r0]!
414 vst1.16 {q10}, [r0]!
415 vst1.16 {q11}, [r0]!
416 vst1.16 {q12}, [r0]!
417 vst1.16 {q13}, [r0]!
418 vst1.16 {q14}, [r0]!
419 vst1.16 {q15}, [r0]!
420 vst1.16 {q0}, [r0]!
421 vst1.16 {q1}, [r0]!
422 vst1.16 {q2}, [r0]!
423 vst1.16 {q3}, [r0]!
424 vst1.16 {q4}, [r0]!
425 vst1.16 {q5}, [r0]!
426 vst1.16 {q6}, [r0]!
427 vst1.16 {q7}, [r0]!
428
429 ; increment pointers by adjusted stride (not necessary for r0/out)
430 ; go back by 7*32 for the seven lines fully advanced by the reads and adds
431 ; go back by 32 for the eighth line, which was only read
432 ; advance by 16*2 to go to the next pair
433 sub r3, r3, #7*32*2 + 32 - 16*2
434 ; transpose pair loop processing
435 subs r8, r8, #1
436 bne idct32_transpose_pair_loop
437
438 ; restore r0/input to its original value
439 sub r0, r0, #32*8*2
440
441 ; Instead of doing the transforms stage by stage, it is done by loading
442 ; some input values and doing as many stages as possible to minimize the
443 ; storing/loading of intermediate results. To fit within registers, the
444 ; final coefficients are cut into four blocks:
445 ; BLOCK A: 16-19,28-31
446 ; BLOCK B: 20-23,24-27
445 ; BLOCK C: 8-11,12-15
448 ; BLOCK D: 0-3,4-7
449 ; Blocks A and C are straight calculation through the various stages. In
450 ; block B, further calculations are performed using the results from
451 ; block A. In block D, further calculations are performed using the results
452 ; from block C and then the final calculations are done using results from
453 ; block A and B which have been combined at the end of block B.
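;
; A pseudocode sketch of that ordering (illustrative names only):
;   block_a();                      /* coefficients for 16-19,28-31 */
;   block_b(results_a);             /* 20-27, consumes block A      */
;   block_c();                      /* 8-15                         */
;   block_d(results_c, results_ab); /* 0-7, then the final stages   */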
454
455 ; --------------------------------------------------------------------------
456 ; BLOCK A: 16-19,28-31
457 ; --------------------------------------------------------------------------
458 ; generate 16,17,30,31
459 ; --------------------------------------------------------------------------
460 ; part of stage 1
461 ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
462 ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
463 ;step1b[16][i] = dct_const_round_shift(temp1);
464 ;step1b[31][i] = dct_const_round_shift(temp2);
465 LOAD_FROM_TRANSPOSED 0, 1, 31
466 DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
467 ; --------------------------------------------------------------------------
468 ; part of stage 1
469 ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
470 ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
471 ;step1b[17][i] = dct_const_round_shift(temp1);
472 ;step1b[30][i] = dct_const_round_shift(temp2);
473 LOAD_FROM_TRANSPOSED 31, 17, 15
474 DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
475 ; --------------------------------------------------------------------------
476 ; part of stage 2
477 ;step2[16] = step1b[16][i] + step1b[17][i];
478 ;step2[17] = step1b[16][i] - step1b[17][i];
479 ;step2[30] = -step1b[30][i] + step1b[31][i];
480 ;step2[31] = step1b[30][i] + step1b[31][i];
481 vadd.s16 q4, q0, q1
482 vsub.s16 q13, q0, q1
483 vadd.s16 q6, q2, q3
484 vsub.s16 q14, q2, q3
485 ; --------------------------------------------------------------------------
486 ; part of stage 3
487 ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
488 ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
489 ;step3[17] = dct_const_round_shift(temp1);
490 ;step3[30] = dct_const_round_shift(temp2);
491 DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
492 ; --------------------------------------------------------------------------
493 ; generate 18,19,28,29
494 ; --------------------------------------------------------------------------
495 ; part of stage 1
496 ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
497 ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
498 ;step1b[18][i] = dct_const_round_shift(temp1);
499 ;step1b[29][i] = dct_const_round_shift(temp2);
500 LOAD_FROM_TRANSPOSED 15, 9, 23
501 DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
502 ; --------------------------------------------------------------------------
503 ; part of stage 1
504 ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
505 ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
506 ;step1b[19][i] = dct_const_round_shift(temp1);
507 ;step1b[28][i] = dct_const_round_shift(temp2);
508 LOAD_FROM_TRANSPOSED 23, 25, 7
509 DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
510 ; --------------------------------------------------------------------------
511 ; part of stage 2
512 ;step2[18] = -step1b[18][i] + step1b[19][i];
513 ;step2[19] = step1b[18][i] + step1b[19][i];
514 ;step2[28] = step1b[28][i] + step1b[29][i];
515 ;step2[29] = step1b[28][i] - step1b[29][i];
516 vsub.s16 q13, q3, q2
517 vadd.s16 q3, q3, q2
518 vsub.s16 q14, q1, q0
519 vadd.s16 q2, q1, q0
520 ; --------------------------------------------------------------------------
521 ; part of stage 3
522 ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
523 ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
524 ;step3[29] = dct_const_round_shift(temp1);
525 ;step3[18] = dct_const_round_shift(temp2);
526 DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
527 ; --------------------------------------------------------------------------
528 ; combine 16-19,28-31
529 ; --------------------------------------------------------------------------
530 ; part of stage 4
531 ;step1[16] = step1b[16][i] + step1b[19][i];
532 ;step1[17] = step1b[17][i] + step1b[18][i];
533 ;step1[18] = step1b[17][i] - step1b[18][i];
534 ;step1[29] = step1b[30][i] - step1b[29][i];
535 ;step1[30] = step1b[30][i] + step1b[29][i];
536 ;step1[31] = step1b[31][i] + step1b[28][i];
537 vadd.s16 q8, q4, q2
538 vadd.s16 q9, q5, q0
539 vadd.s16 q10, q7, q1
540 vadd.s16 q15, q6, q3
541 vsub.s16 q13, q5, q0
542 vsub.s16 q14, q7, q1
543 STORE_IN_OUTPUT 0, 16, 31, q8, q15
544 STORE_IN_OUTPUT 31, 17, 30, q9, q10
545 ; --------------------------------------------------------------------------
546 ; part of stage 5
547 ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
548 ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
549 ;step2[18] = dct_const_round_shift(temp1);
550 ;step2[29] = dct_const_round_shift(temp2);
551 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
552 STORE_IN_OUTPUT 30, 29, 18, q1, q0
553 ; --------------------------------------------------------------------------
554 ; part of stage 4
555 ;step1[19] = step1b[16][i] - step1b[19][i];
556 ;step1[28] = step1b[31][i] - step1b[28][i];
557 vsub.s16 q13, q4, q2
558 vsub.s16 q14, q6, q3
559 ; --------------------------------------------------------------------------
560 ; part of stage 5
561 ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
562 ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
563 ;step2[19] = dct_const_round_shift(temp1);
564 ;step2[28] = dct_const_round_shift(temp2);
565 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
566 STORE_IN_OUTPUT 18, 19, 28, q4, q6
567 ; --------------------------------------------------------------------------
568
569
570 ; --------------------------------------------------------------------------
571 ; BLOCK B: 20-23,24-27
572 ; --------------------------------------------------------------------------
573 ; generate 20,21,26,27
574 ; --------------------------------------------------------------------------
575 ; part of stage 1
576 ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
577 ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
578 ;step1b[20][i] = dct_const_round_shift(temp1);
579 ;step1b[27][i] = dct_const_round_shift(temp2);
580 LOAD_FROM_TRANSPOSED 7, 5, 27
581 DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
582 ; --------------------------------------------------------------------------
583 ; part of stage 1
584 ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
585 ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
586 ;step1b[21][i] = dct_const_round_shift(temp1);
587 ;step1b[26][i] = dct_const_round_shift(temp2);
588 LOAD_FROM_TRANSPOSED 27, 21, 11
589 DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
590 ; --------------------------------------------------------------------------
591 ; part of stage 2
592 ;step2[20] = step1b[20][i] + step1b[21][i];
593 ;step2[21] = step1b[20][i] - step1b[21][i];
594 ;step2[26] = -step1b[26][i] + step1b[27][i];
595 ;step2[27] = step1b[26][i] + step1b[27][i];
596 vsub.s16 q13, q0, q1
597 vadd.s16 q0, q0, q1
598 vsub.s16 q14, q2, q3
599 vadd.s16 q2, q2, q3
600 ; --------------------------------------------------------------------------
601 ; part of stage 3
602 ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
603 ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
604 ;step3[21] = dct_const_round_shift(temp1);
605 ;step3[26] = dct_const_round_shift(temp2);
606 DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
607 ; --------------------------------------------------------------------------
608 ; generate 22,23,24,25
609 ; --------------------------------------------------------------------------
610 ; part of stage 1
611 ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
612 ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
613 ;step1b[22][i] = dct_const_round_shift(temp1);
614 ;step1b[25][i] = dct_const_round_shift(temp2);
615 LOAD_FROM_TRANSPOSED 11, 13, 19
616 DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
617 ; --------------------------------------------------------------------------
618 ; part of stage 1
619 ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
620 ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
621 ;step1b[23][i] = dct_const_round_shift(temp1);
622 ;step1b[24][i] = dct_const_round_shift(temp2);
623 LOAD_FROM_TRANSPOSED 19, 29, 3
624 DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
625 ; --------------------------------------------------------------------------
626 ; part of stage 2
627 ;step2[22] = -step1b[22][i] + step1b[23][i];
628 ;step2[23] = step1b[22][i] + step1b[23][i];
629 ;step2[24] = step1b[24][i] + step1b[25][i];
630 ;step2[25] = step1b[24][i] - step1b[25][i];
631 vsub.s16 q14, q4, q5
632 vadd.s16 q5, q4, q5
633 vsub.s16 q13, q6, q7
634 vadd.s16 q6, q6, q7
635 ; --------------------------------------------------------------------------
636 ; part of stage 3
637 ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
638 ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
639 ;step3[25] = dct_const_round_shift(temp1);
640 ;step3[22] = dct_const_round_shift(temp2);
641 DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
642 ; --------------------------------------------------------------------------
643 ; combine 20-23,24-27
644 ; --------------------------------------------------------------------------
645 ; part of stage 4
646 ;step1[22] = step1b[22][i] + step1b[21][i];
647 ;step1[23] = step1b[23][i] + step1b[20][i];
648 vadd.s16 q10, q7, q1
649 vadd.s16 q11, q5, q0
650 ;step1[24] = step1b[24][i] + step1b[27][i];
651 ;step1[25] = step1b[25][i] + step1b[26][i];
652 vadd.s16 q12, q6, q2
653 vadd.s16 q15, q4, q3
654 ; --------------------------------------------------------------------------
655 ; part of stage 6
656 ;step3[16] = step1b[16][i] + step1b[23][i];
657 ;step3[17] = step1b[17][i] + step1b[22][i];
658 ;step3[22] = step1b[17][i] - step1b[22][i];
659 ;step3[23] = step1b[16][i] - step1b[23][i];
660 LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
661 vadd.s16 q8, q14, q11
662 vadd.s16 q9, q13, q10
663 vsub.s16 q13, q13, q10
664 vsub.s16 q11, q14, q11
665 STORE_IN_OUTPUT 17, 17, 16, q9, q8
666 ; --------------------------------------------------------------------------
667 ; part of stage 6
668 ;step3[24] = step1b[31][i] - step1b[24][i];
669 ;step3[25] = step1b[30][i] - step1b[25][i];
670 ;step3[30] = step1b[30][i] + step1b[25][i];
671 ;step3[31] = step1b[31][i] + step1b[24][i];
672 LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
673 vsub.s16 q8, q9, q12
674 vadd.s16 q10, q14, q15
675 vsub.s16 q14, q14, q15
676 vadd.s16 q12, q9, q12
677 STORE_IN_OUTPUT 31, 30, 31, q10, q12
678 ; --------------------------------------------------------------------------
679 ; TODO(cd) do some register allocation change to remove these push/pop
680 vpush {q8} ; [24]
681 vpush {q11} ; [23]
682 ; --------------------------------------------------------------------------
683 ; part of stage 7
684 ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
685 ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
686 ;step1[22] = dct_const_round_shift(temp1);
687 ;step1[25] = dct_const_round_shift(temp2);
688 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
689 STORE_IN_OUTPUT 31, 25, 22, q14, q13
690 ; --------------------------------------------------------------------------
691 ; part of stage 7
692 ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
693 ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
694 ;step1[23] = dct_const_round_shift(temp1);
695 ;step1[24] = dct_const_round_shift(temp2);
696 ; TODO(cd) do some register allocation change to remove these push/pop
697 vpop {q13} ; [23]
698 vpop {q14} ; [24]
699 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
700 STORE_IN_OUTPUT 22, 24, 23, q14, q13
701 ; --------------------------------------------------------------------------
702 ; part of stage 4
703 ;step1[20] = step1b[23][i] - step1b[20][i];
704 ;step1[27] = step1b[24][i] - step1b[27][i];
705 vsub.s16 q14, q5, q0
706 vsub.s16 q13, q6, q2
707 ; --------------------------------------------------------------------------
708 ; part of stage 5
709 ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
710 ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
711 ;step2[27] = dct_const_round_shift(temp1);
712 ;step2[20] = dct_const_round_shift(temp2);
713 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
714 ; --------------------------------------------------------------------------
715 ; part of stage 4
716 ;step1[21] = step1b[22][i] - step1b[21][i];
717 ;step1[26] = step1b[25][i] - step1b[26][i];
718 vsub.s16 q14, q7, q1
719 vsub.s16 q13, q4, q3
720 ; --------------------------------------------------------------------------
721 ; part of stage 5
722 ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
723 ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
724 ;step2[26] = dct_const_round_shift(temp1);
725 ;step2[21] = dct_const_round_shift(temp2);
726 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
727 ; --------------------------------------------------------------------------
728 ; part of stage 6
729 ;step3[18] = step1b[18][i] + step1b[21][i];
730 ;step3[19] = step1b[19][i] + step1b[20][i];
731 ;step3[20] = step1b[19][i] - step1b[20][i];
732 ;step3[21] = step1b[18][i] - step1b[21][i];
733 LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
734 vadd.s16 q8, q14, q1
735 vadd.s16 q9, q13, q6
736 vsub.s16 q13, q13, q6
737 vsub.s16 q1, q14, q1
738 STORE_IN_OUTPUT 19, 18, 19, q8, q9
739 ; --------------------------------------------------------------------------
740 ; part of stage 6
741 ;step3[27] = step1b[28][i] - step1b[27][i];
742 ;step3[28] = step1b[28][i] + step1b[27][i];
743 ;step3[29] = step1b[29][i] + step1b[26][i];
744 ;step3[26] = step1b[29][i] - step1b[26][i];
745 LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
746 vsub.s16 q14, q8, q5
747 vadd.s16 q10, q8, q5
748 vadd.s16 q11, q9, q0
749 vsub.s16 q0, q9, q0
750 STORE_IN_OUTPUT 29, 28, 29, q10, q11
751 ; --------------------------------------------------------------------------
752 ; part of stage 7
753 ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
754 ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
755 ;step1[20] = dct_const_round_shift(temp1);
756 ;step1[27] = dct_const_round_shift(temp2);
757 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
758 STORE_IN_OUTPUT 29, 20, 27, q13, q14
759 ; --------------------------------------------------------------------------
760 ; part of stage 7
761 ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
762 ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
763 ;step1[21] = dct_const_round_shift(temp1);
764 ;step1[26] = dct_const_round_shift(temp2);
765 DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
766 STORE_IN_OUTPUT 27, 21, 26, q1, q0
767 ; --------------------------------------------------------------------------
768
769
770 ; --------------------------------------------------------------------------
771 ; BLOCK C: 8-11,12-15
772 ; --------------------------------------------------------------------------
773 ; generate 8,9,14,15
774 ; --------------------------------------------------------------------------
775 ; part of stage 2
776 ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
777 ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
778 ;step2[8] = dct_const_round_shift(temp1);
779 ;step2[15] = dct_const_round_shift(temp2);
780 LOAD_FROM_TRANSPOSED 3, 2, 30
781 DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
782 ; --------------------------------------------------------------------------
783 ; part of stage 2
784 ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
785 ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
786 ;step2[9] = dct_const_round_shift(temp1);
787 ;step2[14] = dct_const_round_shift(temp2);
788 LOAD_FROM_TRANSPOSED 30, 18, 14
789 DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
790 ; --------------------------------------------------------------------------
791 ; part of stage 3
792 ;step3[8] = step1b[8][i] + step1b[9][i];
793 ;step3[9] = step1b[8][i] - step1b[9][i];
794 ;step3[14] = step1b[15][i] - step1b[14][i];
795 ;step3[15] = step1b[15][i] + step1b[14][i];
796 vsub.s16 q13, q0, q1
797 vadd.s16 q0, q0, q1
798 vsub.s16 q14, q2, q3
799 vadd.s16 q2, q2, q3
800 ; --------------------------------------------------------------------------
801 ; part of stage 4
802 ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
803 ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
804 ;step1[9] = dct_const_round_shift(temp1);
805 ;step1[14] = dct_const_round_shift(temp2);
806 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
807 ; --------------------------------------------------------------------------
808 ; generate 10,11,12,13
809 ; --------------------------------------------------------------------------
810 ; part of stage 2
811 ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
812 ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
813 ;step2[10] = dct_const_round_shift(temp1);
814 ;step2[13] = dct_const_round_shift(temp2);
815 LOAD_FROM_TRANSPOSED 14, 10, 22
816 DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
817 ; --------------------------------------------------------------------------
818 ; part of stage 2
819 ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
820 ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
821 ;step2[11] = dct_const_round_shift(temp1);
822 ;step2[12] = dct_const_round_shift(temp2);
823 LOAD_FROM_TRANSPOSED 22, 26, 6
824 DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
825 ; --------------------------------------------------------------------------
826 ; part of stage 3
827 ;step3[10] = step1b[11][i] - step1b[10][i];
828 ;step3[11] = step1b[11][i] + step1b[10][i];
829 ;step3[12] = step1b[12][i] + step1b[13][i];
830 ;step3[13] = step1b[12][i] - step1b[13][i];
831 vsub.s16 q14, q4, q5
832 vadd.s16 q5, q4, q5
833 vsub.s16 q13, q6, q7
834 vadd.s16 q6, q6, q7
835 ; --------------------------------------------------------------------------
836 ; part of stage 4
837 ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
838 ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
839 ;step1[13] = dct_const_round_shift(temp1);
840 ;step1[10] = dct_const_round_shift(temp2);
841 DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
842 ; --------------------------------------------------------------------------
843 ; combine 8-11,12-15
844 ; --------------------------------------------------------------------------
845 ; part of stage 5
846 ;step2[8] = step1b[8][i] + step1b[11][i];
847 ;step2[9] = step1b[9][i] + step1b[10][i];
848 ;step2[10] = step1b[9][i] - step1b[10][i];
849 vadd.s16 q8, q0, q5
850 vadd.s16 q9, q1, q7
851 vsub.s16 q13, q1, q7
852 ;step2[13] = step1b[14][i] - step1b[13][i];
853 ;step2[14] = step1b[14][i] + step1b[13][i];
854 ;step2[15] = step1b[15][i] + step1b[12][i];
855 vsub.s16 q14, q3, q4
856 vadd.s16 q10, q3, q4
857 vadd.s16 q15, q2, q6
858 STORE_IN_OUTPUT 26, 8, 15, q8, q15
859 STORE_IN_OUTPUT 15, 9, 14, q9, q10
860 ; --------------------------------------------------------------------------
861 ; part of stage 6
862 ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
863 ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
864 ;step3[10] = dct_const_round_shift(temp1);
865 ;step3[13] = dct_const_round_shift(temp2);
866 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
867 STORE_IN_OUTPUT 14, 13, 10, q3, q1
868 ; --------------------------------------------------------------------------
869 ; part of stage 5
870 ;step2[11] = step1b[8][i] - step1b[11][i];
871 ;step2[12] = step1b[15][i] - step1b[12][i];
872 vsub.s16 q13, q0, q5
873 vsub.s16 q14, q2, q6
874 ; --------------------------------------------------------------------------
875 ; part of stage 6
876 ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
877 ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
878 ;step3[11] = dct_const_round_shift(temp1);
879 ;step3[12] = dct_const_round_shift(temp2);
880 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
881 STORE_IN_OUTPUT 10, 11, 12, q1, q3
882 ; --------------------------------------------------------------------------
883
884
885 ; --------------------------------------------------------------------------
886 ; BLOCK D: 0-3,4-7
887 ; --------------------------------------------------------------------------
888 ; generate 4,5,6,7
889 ; --------------------------------------------------------------------------
890 ; part of stage 3
891 ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
892 ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
893 ;step3[4] = dct_const_round_shift(temp1);
894 ;step3[7] = dct_const_round_shift(temp2);
895 LOAD_FROM_TRANSPOSED 6, 4, 28
896 DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
897 ; --------------------------------------------------------------------------
898 ; part of stage 3
899 ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
900 ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
901 ;step3[5] = dct_const_round_shift(temp1);
902 ;step3[6] = dct_const_round_shift(temp2);
903 LOAD_FROM_TRANSPOSED 28, 20, 12
904 DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
905 ; --------------------------------------------------------------------------
906 ; part of stage 4
907 ;step1[4] = step1b[4][i] + step1b[5][i];
908 ;step1[5] = step1b[4][i] - step1b[5][i];
909 ;step1[6] = step1b[7][i] - step1b[6][i];
910 ;step1[7] = step1b[7][i] + step1b[6][i];
911 vsub.s16 q13, q0, q1
912 vadd.s16 q0, q0, q1
913 vsub.s16 q14, q2, q3
914 vadd.s16 q2, q2, q3
915 ; --------------------------------------------------------------------------
916 ; part of stage 5
917 ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
918 ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
919 ;step2[5] = dct_const_round_shift(temp1);
920 ;step2[6] = dct_const_round_shift(temp2);
921 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
922 ; --------------------------------------------------------------------------
923 ; generate 0,1,2,3
924 ; --------------------------------------------------------------------------
925 ; part of stage 4
926 ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
927 ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
928 ;step1[1] = dct_const_round_shift(temp1);
929 ;step1[0] = dct_const_round_shift(temp2);
930 LOAD_FROM_TRANSPOSED 12, 0, 16
931 DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
932 ; --------------------------------------------------------------------------
933 ; part of stage 4
934 ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
935 ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
936 ;step1[2] = dct_const_round_shift(temp1);
937 ;step1[3] = dct_const_round_shift(temp2);
938 LOAD_FROM_TRANSPOSED 16, 8, 24
939 DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
940 ; --------------------------------------------------------------------------
941 ; part of stage 5
942 ;step2[0] = step1b[0][i] + step1b[3][i];
943 ;step2[1] = step1b[1][i] + step1b[2][i];
944 ;step2[2] = step1b[1][i] - step1b[2][i];
945 ;step2[3] = step1b[0][i] - step1b[3][i];
946 vadd.s16 q4, q7, q6
947 vsub.s16 q7, q7, q6
948 vsub.s16 q6, q5, q14
949 vadd.s16 q5, q5, q14
950 ; --------------------------------------------------------------------------
951 ; combine 0-3,4-7
952 ; --------------------------------------------------------------------------
953 ; part of stage 6
954 ;step3[0] = step1b[0][i] + step1b[7][i];
955 ;step3[1] = step1b[1][i] + step1b[6][i];
956 ;step3[2] = step1b[2][i] + step1b[5][i];
957 ;step3[3] = step1b[3][i] + step1b[4][i];
958 vadd.s16 q8, q4, q2
959 vadd.s16 q9, q5, q3
960 vadd.s16 q10, q6, q1
961 vadd.s16 q11, q7, q0
962 ;step3[4] = step1b[3][i] - step1b[4][i];
963 ;step3[5] = step1b[2][i] - step1b[5][i];
964 ;step3[6] = step1b[1][i] - step1b[6][i];
965 ;step3[7] = step1b[0][i] - step1b[7][i];
966 vsub.s16 q12, q7, q0
967 vsub.s16 q13, q6, q1
968 vsub.s16 q14, q5, q3
969 vsub.s16 q15, q4, q2
970 ; --------------------------------------------------------------------------
971 ; part of stage 7
972 ;step1[0] = step1b[0][i] + step1b[15][i];
973 ;step1[1] = step1b[1][i] + step1b[14][i];
974 ;step1[14] = step1b[1][i] - step1b[14][i];
975 ;step1[15] = step1b[0][i] - step1b[15][i];
976 LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
977 vadd.s16 q2, q8, q1
978 vadd.s16 q3, q9, q0
979 vsub.s16 q4, q9, q0
980 vsub.s16 q5, q8, q1
981 ; --------------------------------------------------------------------------
982 ; part of final stage
983 ;output[14 * 32] = step1b[14][i] + step1b[17][i];
984 ;output[15 * 32] = step1b[15][i] + step1b[16][i];
985 ;output[16 * 32] = step1b[15][i] - step1b[16][i];
986 ;output[17 * 32] = step1b[14][i] - step1b[17][i];
987 LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
988 vadd.s16 q8, q4, q1
989 vadd.s16 q9, q5, q0
990 vsub.s16 q6, q5, q0
991 vsub.s16 q7, q4, q1
992
993 cmp r5, #0
994 bgt idct32_bands_end_2nd_pass
995
996 idct32_bands_end_1st_pass
997 STORE_IN_OUTPUT 17, 16, 17, q6, q7
998 STORE_IN_OUTPUT 17, 14, 15, q8, q9
999 ; --------------------------------------------------------------------------
1000 ; part of final stage
1001 ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1002 ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1003 ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1004 ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1005 LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
1006 vadd.s16 q4, q2, q1
1007 vadd.s16 q5, q3, q0
1008 vsub.s16 q6, q3, q0
1009 vsub.s16 q7, q2, q1
1010 STORE_IN_OUTPUT 31, 30, 31, q6, q7
1011 STORE_IN_OUTPUT 31, 0, 1, q4, q5
1012 ; --------------------------------------------------------------------------
1013 ; part of stage 7
1014 ;step1[2] = step1b[2][i] + step1b[13][i];
1015 ;step1[3] = step1b[3][i] + step1b[12][i];
1016 ;step1[12] = step1b[3][i] - step1b[12][i];
1017 ;step1[13] = step1b[2][i] - step1b[13][i];
1018 LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
1019 vadd.s16 q2, q10, q1
1020 vadd.s16 q3, q11, q0
1021 vsub.s16 q4, q11, q0
1022 vsub.s16 q5, q10, q1
1023 ; --------------------------------------------------------------------------
1024 ; part of final stage
1025 ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1026 ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1027 ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1028 ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1029 LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1030 vadd.s16 q8, q4, q1
1031 vadd.s16 q9, q5, q0
1032 vsub.s16 q6, q5, q0
1033 vsub.s16 q7, q4, q1
1034 STORE_IN_OUTPUT 19, 18, 19, q6, q7
1035 STORE_IN_OUTPUT 19, 12, 13, q8, q9
1036 ; --------------------------------------------------------------------------
1037 ; part of final stage
1038 ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1039 ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1040 ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1041 ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1042 LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
1043 vadd.s16 q4, q2, q1
1044 vadd.s16 q5, q3, q0
1045 vsub.s16 q6, q3, q0
1046 vsub.s16 q7, q2, q1
1047 STORE_IN_OUTPUT 29, 28, 29, q6, q7
1048 STORE_IN_OUTPUT 29, 2, 3, q4, q5
1049 ; --------------------------------------------------------------------------
1050 ; part of stage 7
1051 ;step1[4] = step1b[4][i] + step1b[11][i];
1052 ;step1[5] = step1b[5][i] + step1b[10][i];
1053 ;step1[10] = step1b[5][i] - step1b[10][i];
1054 ;step1[11] = step1b[4][i] - step1b[11][i];
1055 LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
1056 vadd.s16 q2, q12, q1
1057 vadd.s16 q3, q13, q0
1058 vsub.s16 q4, q13, q0
1059 vsub.s16 q5, q12, q1
1060 ; --------------------------------------------------------------------------
1061 ; part of final stage
1062 ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1063 ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1064 ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1065 ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1066 LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1067 vadd.s16 q8, q4, q1
1068 vadd.s16 q9, q5, q0
1069 vsub.s16 q6, q5, q0
1070 vsub.s16 q7, q4, q1
1071 STORE_IN_OUTPUT 21, 20, 21, q6, q7
1072 STORE_IN_OUTPUT 21, 10, 11, q8, q9
1073 ; --------------------------------------------------------------------------
1074 ; part of final stage
1075 ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1076 ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1077 ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1078 ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1079 LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
1080 vadd.s16 q4, q2, q1
1081 vadd.s16 q5, q3, q0
1082 vsub.s16 q6, q3, q0
1083 vsub.s16 q7, q2, q1
1084 STORE_IN_OUTPUT 27, 26, 27, q6, q7
1085 STORE_IN_OUTPUT 27, 4, 5, q4, q5
1086 ; --------------------------------------------------------------------------
1087 ; part of stage 7
1088 ;step1[6] = step1b[6][i] + step1b[9][i];
1089 ;step1[7] = step1b[7][i] + step1b[8][i];
1090 ;step1[8] = step1b[7][i] - step1b[8][i];
1091 ;step1[9] = step1b[6][i] - step1b[9][i];
1092 LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
1093 vadd.s16 q2, q14, q1
1094 vadd.s16 q3, q15, q0
1095 vsub.s16 q4, q15, q0
1096 vsub.s16 q5, q14, q1
1097 ; --------------------------------------------------------------------------
1098 ; part of final stage
1099 ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1100 ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1101 ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1102 ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1103 LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1104 vadd.s16 q8, q4, q1
1105 vadd.s16 q9, q5, q0
1106 vsub.s16 q6, q5, q0
1107 vsub.s16 q7, q4, q1
1108 STORE_IN_OUTPUT 23, 22, 23, q6, q7
1109 STORE_IN_OUTPUT 23, 8, 9, q8, q9
1110 ; --------------------------------------------------------------------------
1111 ; part of final stage
1112 ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1113 ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1114 ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1115 ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1116 LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
1117 vadd.s16 q4, q2, q1
1118 vadd.s16 q5, q3, q0
1119 vsub.s16 q6, q3, q0
1120 vsub.s16 q7, q2, q1
1121 STORE_IN_OUTPUT 25, 24, 25, q6, q7
1122 STORE_IN_OUTPUT 25, 6, 7, q4, q5
1123
1124 ; restore r0 by removing the last offset from the last
1125 ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1126 sub r0, r0, #24*8*2
1127 ; restore r1 by removing the last offset from the last
1128 ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
1129 ; advance by 8 columns => 8*2
1130 sub r1, r1, #7*32*2 - 8*2
1131 ; advance by 8 lines (8*32*2)
1132 ; go back by the two pairs from the loop (32*2)
1133 add r3, r3, #8*32*2 - 32*2
1134
1135 ; bands loop processing
1136 subs r4, r4, #1
1137 bne idct32_bands_loop
1138
1139 ; parameters for second pass
1140 ; the input of pass2 is the result of pass1. we have to remove the offset
1141 ; of 32 columns induced by the above idct32_bands_loop
1142 sub r3, r1, #32*2
1143 ; r1 = pass2[32 * 32]
1144 add r1, sp, #2048
1145
1146 ; pass loop processing
1147 add r5, r5, #1
1148 b idct32_pass_loop
1149
1150 idct32_bands_end_2nd_pass
1151 STORE_COMBINE_CENTER_RESULTS
1152 ; --------------------------------------------------------------------------
1153 ; part of final stage
1154 ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
1155 ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
1156 ;output[30 * 32] = step1b[1][i] - step1b[30][i];
1157 ;output[31 * 32] = step1b[0][i] - step1b[31][i];
1158 LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
1159 vadd.s16 q4, q2, q1
1160 vadd.s16 q5, q3, q0
1161 vsub.s16 q6, q3, q0
1162 vsub.s16 q7, q2, q1
1163 STORE_COMBINE_EXTREME_RESULTS
1164 ; --------------------------------------------------------------------------
1165 ; part of stage 7
1166 ;step1[2] = step1b[2][i] + step1b[13][i];
1167 ;step1[3] = step1b[3][i] + step1b[12][i];
1168 ;step1[12] = step1b[3][i] - step1b[12][i];
1169 ;step1[13] = step1b[2][i] - step1b[13][i];
1170 LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
1171 vadd.s16 q2, q10, q1
1172 vadd.s16 q3, q11, q0
1173 vsub.s16 q4, q11, q0
1174 vsub.s16 q5, q10, q1
1175 ; --------------------------------------------------------------------------
1176 ; part of final stage
1177 ;output[12 * 32] = step1b[12][i] + step1b[19][i];
1178 ;output[13 * 32] = step1b[13][i] + step1b[18][i];
1179 ;output[18 * 32] = step1b[13][i] - step1b[18][i];
1180 ;output[19 * 32] = step1b[12][i] - step1b[19][i];
1181 LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
1182 vadd.s16 q8, q4, q1
1183 vadd.s16 q9, q5, q0
1184 vsub.s16 q6, q5, q0
1185 vsub.s16 q7, q4, q1
1186 STORE_COMBINE_CENTER_RESULTS
1187 ; --------------------------------------------------------------------------
1188 ; part of final stage
1189 ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
1190 ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
1191 ;output[28 * 32] = step1b[3][i] - step1b[28][i];
1192 ;output[29 * 32] = step1b[2][i] - step1b[29][i];
1193 LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
1194 vadd.s16 q4, q2, q1
1195 vadd.s16 q5, q3, q0
1196 vsub.s16 q6, q3, q0
1197 vsub.s16 q7, q2, q1
1198 STORE_COMBINE_EXTREME_RESULTS
1199 ; --------------------------------------------------------------------------
1200 ; part of stage 7
1201 ;step1[4] = step1b[4][i] + step1b[11][i];
1202 ;step1[5] = step1b[5][i] + step1b[10][i];
1203 ;step1[10] = step1b[5][i] - step1b[10][i];
1204 ;step1[11] = step1b[4][i] - step1b[11][i];
1205 LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
1206 vadd.s16 q2, q12, q1
1207 vadd.s16 q3, q13, q0
1208 vsub.s16 q4, q13, q0
1209 vsub.s16 q5, q12, q1
1210 ; --------------------------------------------------------------------------
1211 ; part of final stage
1212 ;output[10 * 32] = step1b[10][i] + step1b[21][i];
1213 ;output[11 * 32] = step1b[11][i] + step1b[20][i];
1214 ;output[20 * 32] = step1b[11][i] - step1b[20][i];
1215 ;output[21 * 32] = step1b[10][i] - step1b[21][i];
1216 LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
1217 vadd.s16 q8, q4, q1
1218 vadd.s16 q9, q5, q0
1219 vsub.s16 q6, q5, q0
1220 vsub.s16 q7, q4, q1
1221 STORE_COMBINE_CENTER_RESULTS
1222 ; --------------------------------------------------------------------------
1223 ; part of final stage
1224 ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
1225 ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
1226 ;output[26 * 32] = step1b[5][i] - step1b[26][i];
1227 ;output[27 * 32] = step1b[4][i] - step1b[27][i];
1228 LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
1229 vadd.s16 q4, q2, q1
1230 vadd.s16 q5, q3, q0
1231 vsub.s16 q6, q3, q0
1232 vsub.s16 q7, q2, q1
1233 STORE_COMBINE_EXTREME_RESULTS
1234 ; --------------------------------------------------------------------------
1235 ; part of stage 7
1236 ;step1[6] = step1b[6][i] + step1b[9][i];
1237 ;step1[7] = step1b[7][i] + step1b[8][i];
1238 ;step1[8] = step1b[7][i] - step1b[8][i];
1239 ;step1[9] = step1b[6][i] - step1b[9][i];
1240 LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
1241 vadd.s16 q2, q14, q1
1242 vadd.s16 q3, q15, q0
1243 vsub.s16 q4, q15, q0
1244 vsub.s16 q5, q14, q1
1245 ; --------------------------------------------------------------------------
1246 ; part of final stage
1247 ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
1248 ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
1249 ;output[22 * 32] = step1b[9][i] - step1b[22][i];
1250 ;output[23 * 32] = step1b[8][i] - step1b[23][i];
1251 LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
1252 vadd.s16 q8, q4, q1
1253 vadd.s16 q9, q5, q0
1254 vsub.s16 q6, q5, q0
1255 vsub.s16 q7, q4, q1
1256 STORE_COMBINE_CENTER_RESULTS_LAST
1257 ; --------------------------------------------------------------------------
1258 ; part of final stage
1259 ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
1260 ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
1261 ;output[24 * 32] = step1b[7][i] - step1b[24][i];
1262 ;output[25 * 32] = step1b[6][i] - step1b[25][i];
1263 LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
1264 vadd.s16 q4, q2, q1
1265 vadd.s16 q5, q3, q0
1266 vsub.s16 q6, q3, q0
1267 vsub.s16 q7, q2, q1
1268 STORE_COMBINE_EXTREME_RESULTS_LAST
1269 ; --------------------------------------------------------------------------
1270 ; restore pointers to their initial indices for the next band pass by
1271 ; removing/adding dest_stride * 8. The actual increment by eight pixels
1272 ; is taken care of within the _LAST macros.
1273 add r6, r6, r2, lsl #3
1274 add r9, r9, r2, lsl #3
1275 sub r7, r7, r2, lsl #3
1276 sub r10, r10, r2, lsl #3
1277
1278 ; restore r0 by removing the last offset from the last
1279 ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
1280 sub r0, r0, #24*8*2
1281 ; restore r1 by removing the last offset from the last
1282 ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
1283 ; advance by 8 columns => 8*2
1284 sub r1, r1, #25*32*2 - 8*2
1285 ; advance by 8 lines (8*32*2)
1286 ; go back by the two pairs from the loop (32*2)
1287 add r3, r3, #8*32*2 - 32*2
1288
1289 ; bands loop processing
1290 subs r4, r4, #1
1291 bne idct32_bands_loop
1292
1293 ; stack operation
1294 add sp, sp, #512+2048+2048
1295 vpop {d8-d15}
1296 pop {r4-r11}
1297 bx lr
1298 ENDP ; |vp9_idct32x32_1024_add_neon|
1299 END