source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm - Issue 181493009: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm

Issue 181493009: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
65 vld1.s16 {q14}, [r0]	65 vld1.s16 {q14}, [r0]

66 add r0, #($second_offset - $first_offset)*8*2	66 add r0, #($second_offset - $first_offset)*8*2

67 vld1.s16 {q13}, [r0]	67 vld1.s16 {q13}, [r0]

68 ; (used) two registers (q14, q13)	68 ; (used) two registers (q14, q13)

69 MEND	69 MEND

70 ; --------------------------------------------------------------------------	70 ; --------------------------------------------------------------------------

71 ; Load from output (used as temporary storage)	71 ; Load from output (used as temporary storage)

72 ; reg1 = output[first_offset]	72 ; reg1 = output[first_offset]

73 ; reg2 = output[second_offset]	73 ; reg2 = output[second_offset]

74 ; for proper address calculation, the last offset used when manipulating	74 ; for proper address calculation, the last offset used when manipulating

75 ; output, wethere reading or storing) must be passed in. use 0 for first	75 ; output, whether reading or storing) must be passed in. use 0 for first

76 ; use.	76 ; use.

77 MACRO	77 MACRO

78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2	78 LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2

79 ; address calculation with proper stride and loading	79 ; address calculation with proper stride and loading

80 add r1, #($first_offset - $prev_offset )*32*2	80 add r1, #($first_offset - $prev_offset )*32*2

81 vld1.s16 {$reg1}, [r1]	81 vld1.s16 {$reg1}, [r1]

82 add r1, #($second_offset - $first_offset)*32*2	82 add r1, #($second_offset - $first_offset)*32*2

83 vld1.s16 {$reg2}, [r1]	83 vld1.s16 {$reg2}, [r1]

84 ; (used) two registers ($reg1, $reg2)	84 ; (used) two registers ($reg1, $reg2)

85 MEND	85 MEND

86 ; --------------------------------------------------------------------------	86 ; --------------------------------------------------------------------------

87 ; Store into output (sometimes as as temporary storage)	87 ; Store into output (sometimes as as temporary storage)

88 ; output[first_offset] = reg1	88 ; output[first_offset] = reg1

89 ; output[second_offset] = reg2	89 ; output[second_offset] = reg2

90 ; for proper address calculation, the last offset used when manipulating	90 ; for proper address calculation, the last offset used when manipulating

91 ; output, wethere reading or storing) must be passed in. use 0 for first	91 ; output, whether reading or storing) must be passed in. use 0 for first

92 ; use.	92 ; use.

93 MACRO	93 MACRO

94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2	94 STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2

95 ; address calculation with proper stride and storing	95 ; address calculation with proper stride and storing

96 add r1, #($first_offset - $prev_offset )*32*2	96 add r1, #($first_offset - $prev_offset )*32*2

97 vst1.16 {$reg1}, [r1]	97 vst1.16 {$reg1}, [r1]

98 add r1, #($second_offset - $first_offset)*32*2	98 add r1, #($second_offset - $first_offset)*32*2

99 vst1.16 {$reg2}, [r1]	99 vst1.16 {$reg2}, [r1]

100 MEND	100 MEND

101 ; --------------------------------------------------------------------------	101 ; --------------------------------------------------------------------------

(...skipping 133 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
235 add r7, r7, r2, lsl #1	235 add r7, r7, r2, lsl #1

236 MEND	236 MEND

237 ; --------------------------------------------------------------------------	237 ; --------------------------------------------------------------------------

238 ; Touches q8-q12, q15 (q13-q14 are preserved)	238 ; Touches q8-q12, q15 (q13-q14 are preserved)

239 ; valid output registers are anything but q8-q11	239 ; valid output registers are anything but q8-q11

240 MACRO	240 MACRO

241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4	241 DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4

242 ; TODO(cd): have special case to re-use constants when they are similar for	242 ; TODO(cd): have special case to re-use constants when they are similar for

243 ; consecutive butterflies	243 ; consecutive butterflies

244 ; TODO(cd): have special case when both constants are the same, do the	244 ; TODO(cd): have special case when both constants are the same, do the

245 ; additions/substractions before the multiplies.	245 ; additions/subtractions before the multiplies.

246 ; generate the constants	246 ; generate the constants

247 ; generate scalar constants	247 ; generate scalar constants

248 mov r8, #$first_constant & 0xFF00	248 mov r8, #$first_constant & 0xFF00

249 mov r12, #$second_constant & 0xFF00	249 mov r12, #$second_constant & 0xFF00

250 add r8, #$first_constant & 0x00FF	250 add r8, #$first_constant & 0x00FF

251 add r12, #$second_constant & 0x00FF	251 add r12, #$second_constant & 0x00FF

252 ; generate vector constants	252 ; generate vector constants

253 vdup.16 d30, r8	253 vdup.16 d30, r8

254 vdup.16 d31, r12	254 vdup.16 d31, r12

255 ; (used) two for inputs (regA-regD), one for constants (q15)	255 ; (used) two for inputs (regA-regD), one for constants (q15)

256 ; do some multiplications (ordered for maximum latency hiding)	256 ; do some multiplications (ordered for maximum latency hiding)

257 vmull.s16 q8, $regC, d30	257 vmull.s16 q8, $regC, d30

258 vmull.s16 q10, $regA, d31	258 vmull.s16 q10, $regA, d31

259 vmull.s16 q9, $regD, d30	259 vmull.s16 q9, $regD, d30

260 vmull.s16 q11, $regB, d31	260 vmull.s16 q11, $regB, d31

261 vmull.s16 q12, $regC, d31	261 vmull.s16 q12, $regC, d31

262 ; (used) five for intermediate (q8-q12), one for constants (q15)	262 ; (used) five for intermediate (q8-q12), one for constants (q15)

263 ; do some addition/substractions (to get back two register)	263 ; do some addition/subtractions (to get back two register)

264 vsub.s32 q8, q8, q10	264 vsub.s32 q8, q8, q10

265 vsub.s32 q9, q9, q11	265 vsub.s32 q9, q9, q11

266 ; do more multiplications (ordered for maximum latency hiding)	266 ; do more multiplications (ordered for maximum latency hiding)

267 vmull.s16 q10, $regD, d31	267 vmull.s16 q10, $regD, d31

268 vmull.s16 q11, $regA, d30	268 vmull.s16 q11, $regA, d30

269 vmull.s16 q15, $regB, d30	269 vmull.s16 q15, $regB, d30

270 ; (used) six for intermediate (q8-q12, q15)	270 ; (used) six for intermediate (q8-q12, q15)

271 ; do more addition/substractions	271 ; do more addition/subtractions

272 vadd.s32 q11, q12, q11	272 vadd.s32 q11, q12, q11

273 vadd.s32 q10, q10, q15	273 vadd.s32 q10, q10, q15

274 ; (used) four for intermediate (q8-q11)	274 ; (used) four for intermediate (q8-q11)

275 ; dct_const_round_shift	275 ; dct_const_round_shift

276 vqrshrn.s32 $reg1, q8, #14	276 vqrshrn.s32 $reg1, q8, #14

277 vqrshrn.s32 $reg2, q9, #14	277 vqrshrn.s32 $reg2, q9, #14

278 vqrshrn.s32 $reg3, q11, #14	278 vqrshrn.s32 $reg3, q11, #14

279 vqrshrn.s32 $reg4, q10, #14	279 vqrshrn.s32 $reg4, q10, #14

280 ; (used) two for results, well four d registers	280 ; (used) two for results, well four d registers

281 MEND	281 MEND

(...skipping 1008 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1290 subs r4, r4, #1	1290 subs r4, r4, #1

1291 bne idct32_bands_loop	1291 bne idct32_bands_loop

1292	1292

1293 ; stack operation	1293 ; stack operation

1294 add sp, sp, #512+2048+2048	1294 add sp, sp, #512+2048+2048

1295 vpop {d8-d15}	1295 vpop {d8-d15}

1296 pop {r4-r11}	1296 pop {r4-r11}

1297 bx lr	1297 bx lr

1298 ENDP ; |vp9_idct32x32_1024_add_neon|	1298 ENDP ; |vp9_idct32x32_1024_add_neon|

1299 END	1299 END

OLD	NEW

« no previous file with comments | « source/libvpx/vp8/vp8cx.mk ('k') | source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm » ('j') | no next file with comments »