source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

11 EXPORT \|vp9_idct16x16_256_add_neon_pass1\|	11 EXPORT \|vpx_idct16x16_256_add_neon_pass1\|

12 EXPORT \|vp9_idct16x16_256_add_neon_pass2\|	12 EXPORT \|vpx_idct16x16_256_add_neon_pass2\|

13 EXPORT \|vp9_idct16x16_10_add_neon_pass1\|	13 EXPORT \|vpx_idct16x16_10_add_neon_pass1\|

14 EXPORT \|vp9_idct16x16_10_add_neon_pass2\|	14 EXPORT \|vpx_idct16x16_10_add_neon_pass2\|

15 ARM	15 ARM

16 REQUIRE8	16 REQUIRE8

17 PRESERVE8	17 PRESERVE8

18	18

19 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2	19 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2

20	20

21 ; Transpose an 8x8 matrix of 16-bit data. The data is loaded in q8-q15.	21 ; Transpose an 8x8 matrix of 16-bit data. The data is loaded in q8-q15.

22 MACRO	22 MACRO

23 TRANSPOSE8X8	23 TRANSPOSE8X8

24 vswp d17, d24	24 vswp d17, d24

25 vswp d23, d30	25 vswp d23, d30

26 vswp d21, d28	26 vswp d21, d28

27 vswp d19, d26	27 vswp d19, d26

28 vtrn.32 q8, q10	28 vtrn.32 q8, q10

29 vtrn.32 q9, q11	29 vtrn.32 q9, q11

30 vtrn.32 q12, q14	30 vtrn.32 q12, q14

31 vtrn.32 q13, q15	31 vtrn.32 q13, q15

32 vtrn.16 q8, q9	32 vtrn.16 q8, q9

33 vtrn.16 q10, q11	33 vtrn.16 q10, q11

34 vtrn.16 q12, q13	34 vtrn.16 q12, q13

35 vtrn.16 q14, q15	35 vtrn.16 q14, q15

36 MEND	36 MEND

37	37

38 AREA Block, CODE, READONLY ; name this block of code	38 AREA Block, CODE, READONLY ; name this block of code

39 ;void \|vp9_idct16x16_256_add_neon_pass1\|(int16_t *input,	39 ;void \|vpx_idct16x16_256_add_neon_pass1\|(int16_t *input,

40 ; int16_t *output, int output_stride)	40 ; int16_t *output, int output_stride)

41 ;	41 ;

42 ; r0 int16_t *input	42 ; r0 int16_t *input

43 ; r1 int16_t *output	43 ; r1 int16_t *output

44 ; r2 int output_stride	44 ; r2 int output_stride

45	45

46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output	46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output

47 ; will be stored back into q8-q15 registers. This function will touch q0-q7	47 ; will be stored back into q8-q15 registers. This function will touch q0-q7

48 ; registers and use them as buffer during calculation.	48 ; registers and use them as buffer during calculation.

49 \|vp9_idct16x16_256_add_neon_pass1\| PROC	49 \|vpx_idct16x16_256_add_neon_pass1\| PROC

50	50

51 ; TODO(hkuang): Find a better way to load the elements.	51 ; TODO(hkuang): Find a better way to load the elements.

52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15	52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15

53 vld2.s16 {q8,q9}, [r0]!	53 vld2.s16 {q8,q9}, [r0]!

54 vld2.s16 {q9,q10}, [r0]!	54 vld2.s16 {q9,q10}, [r0]!

55 vld2.s16 {q10,q11}, [r0]!	55 vld2.s16 {q10,q11}, [r0]!

56 vld2.s16 {q11,q12}, [r0]!	56 vld2.s16 {q11,q12}, [r0]!

57 vld2.s16 {q12,q13}, [r0]!	57 vld2.s16 {q12,q13}, [r0]!

58 vld2.s16 {q13,q14}, [r0]!	58 vld2.s16 {q13,q14}, [r0]!

59 vld2.s16 {q14,q15}, [r0]!	59 vld2.s16 {q14,q15}, [r0]!

(...skipping 206 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
266 vst1.64 {d24}, [r1], r2	266 vst1.64 {d24}, [r1], r2

267 vst1.64 {d25}, [r1], r2	267 vst1.64 {d25}, [r1], r2

268 vst1.64 {d26}, [r1], r2	268 vst1.64 {d26}, [r1], r2

269 vst1.64 {d27}, [r1], r2	269 vst1.64 {d27}, [r1], r2

270 vst1.64 {d28}, [r1], r2	270 vst1.64 {d28}, [r1], r2

271 vst1.64 {d29}, [r1], r2	271 vst1.64 {d29}, [r1], r2

272 vst1.64 {d30}, [r1], r2	272 vst1.64 {d30}, [r1], r2

273 vst1.64 {d31}, [r1], r2	273 vst1.64 {d31}, [r1], r2

274	274

275 bx lr	275 bx lr

276 ENDP ; \|vp9_idct16x16_256_add_neon_pass1\|	276 ENDP ; \|vpx_idct16x16_256_add_neon_pass1\|

277	277

278 ;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,	278 ;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,

279 ; int16_t *output,	279 ; int16_t *output,

280 ; int16_t *pass1Output,	280 ; int16_t *pass1Output,

281 ; int16_t skip_adding,	281 ; int16_t skip_adding,

282 ; uint8_t *dest,	282 ; uint8_t *dest,

283 ; int dest_stride)	283 ; int dest_stride)

284 ;	284 ;

285 ; r0 int16_t *src	285 ; r0 int16_t *src

286 ; r1 int16_t *output,	286 ; r1 int16_t *output,

287 ; r2 int16_t *pass1Output,	287 ; r2 int16_t *pass1Output,

288 ; r3 int16_t skip_adding,	288 ; r3 int16_t skip_adding,

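289 ; stack uint8_t *dest (5th argument; AAPCS passes only r0-r3 in registers)	289 ; stack uint8_t *dest (5th argument; AAPCS passes only r0-r3 in registers)

290 ; stack int dest_stride (6th argument; AAPCS passes only r0-r3 in registers)	290 ; stack int dest_stride (6th argument; AAPCS passes only r0-r3 in registers)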

291	291

292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output	292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output

293 ; will be stored back into q8-q15 registers. This function will touch q0-q7	293 ; will be stored back into q8-q15 registers. This function will touch q0-q7

294 ; registers and use them as buffer during calculation.	294 ; registers and use them as buffer during calculation.

295 \|vp9_idct16x16_256_add_neon_pass2\| PROC	295 \|vpx_idct16x16_256_add_neon_pass2\| PROC

296 push {r3-r9}	296 push {r3-r9}

297	297

298 ; TODO(hkuang): Find a better way to load the elements.	298 ; TODO(hkuang): Find a better way to load the elements.

299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15	299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15

300 vld2.s16 {q8,q9}, [r0]!	300 vld2.s16 {q8,q9}, [r0]!

301 vld2.s16 {q9,q10}, [r0]!	301 vld2.s16 {q9,q10}, [r0]!

302 vld2.s16 {q10,q11}, [r0]!	302 vld2.s16 {q10,q11}, [r0]!

303 vld2.s16 {q11,q12}, [r0]!	303 vld2.s16 {q11,q12}, [r0]!

304 vld2.s16 {q12,q13}, [r0]!	304 vld2.s16 {q12,q13}, [r0]!

305 vld2.s16 {q13,q14}, [r0]!	305 vld2.s16 {q13,q14}, [r0]!

(...skipping 471 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
777 vst1.64 {d9}, [r1], r5	777 vst1.64 {d9}, [r1], r5

778 vst1.64 {d10}, [r1], r3	778 vst1.64 {d10}, [r1], r3

779 vst1.64 {d11}, [r1], r5	779 vst1.64 {d11}, [r1], r5

780 vst1.64 {d28}, [r1], r3	780 vst1.64 {d28}, [r1], r3

781 vst1.64 {d29}, [r1], r5	781 vst1.64 {d29}, [r1], r5

782 vst1.64 {d30}, [r1], r3	782 vst1.64 {d30}, [r1], r3

783 vst1.64 {d31}, [r1], r5	783 vst1.64 {d31}, [r1], r5

784 end_idct16x16_pass2	784 end_idct16x16_pass2

785 pop {r3-r9}	785 pop {r3-r9}

786 bx lr	786 bx lr

787 ENDP ; \|vp9_idct16x16_256_add_neon_pass2\|	787 ENDP ; \|vpx_idct16x16_256_add_neon_pass2\|

788	788

789 ;void \|vp9_idct16x16_10_add_neon_pass1\|(int16_t *input,	789 ;void \|vpx_idct16x16_10_add_neon_pass1\|(int16_t *input,

790 ; int16_t *output, int output_stride )	790 ; int16_t *output, int output_stride )

791 ;	791 ;

792 ; r0 int16_t *input	792 ; r0 int16_t *input

793 ; r1 int16_t *output	793 ; r1 int16_t *output

794 ; r2 int output_stride	794 ; r2 int output_stride

795	795

796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output	796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output

797 ; will be stored back into q8-q15 registers. This function will touch q0-q7	797 ; will be stored back into q8-q15 registers. This function will touch q0-q7

798 ; registers and use them as buffer during calculation.	798 ; registers and use them as buffer during calculation.

799 \|vp9_idct16x16_10_add_neon_pass1\| PROC	799 \|vpx_idct16x16_10_add_neon_pass1\| PROC

800	800

801 ; TODO(hkuang): Find a better way to load the elements.	801 ; TODO(hkuang): Find a better way to load the elements.

802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15	802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15

803 vld2.s16 {q8,q9}, [r0]!	803 vld2.s16 {q8,q9}, [r0]!

804 vld2.s16 {q9,q10}, [r0]!	804 vld2.s16 {q9,q10}, [r0]!

805 vld2.s16 {q10,q11}, [r0]!	805 vld2.s16 {q10,q11}, [r0]!

806 vld2.s16 {q11,q12}, [r0]!	806 vld2.s16 {q11,q12}, [r0]!

807 vld2.s16 {q12,q13}, [r0]!	807 vld2.s16 {q12,q13}, [r0]!

808 vld2.s16 {q13,q14}, [r0]!	808 vld2.s16 {q13,q14}, [r0]!

809 vld2.s16 {q14,q15}, [r0]!	809 vld2.s16 {q14,q15}, [r0]!

(...skipping 88 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
898 vst1.64 {d24}, [r1], r2	898 vst1.64 {d24}, [r1], r2

899 vst1.64 {d25}, [r1], r2	899 vst1.64 {d25}, [r1], r2

900 vst1.64 {d26}, [r1], r2	900 vst1.64 {d26}, [r1], r2

901 vst1.64 {d27}, [r1], r2	901 vst1.64 {d27}, [r1], r2

902 vst1.64 {d28}, [r1], r2	902 vst1.64 {d28}, [r1], r2

903 vst1.64 {d29}, [r1], r2	903 vst1.64 {d29}, [r1], r2

904 vst1.64 {d30}, [r1], r2	904 vst1.64 {d30}, [r1], r2

905 vst1.64 {d31}, [r1], r2	905 vst1.64 {d31}, [r1], r2

906	906

907 bx lr	907 bx lr

908 ENDP ; \|vp9_idct16x16_10_add_neon_pass1\|	908 ENDP ; \|vpx_idct16x16_10_add_neon_pass1\|

909	909

910 ;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,	910 ;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,

911 ; int16_t *output,	911 ; int16_t *output,

912 ; int16_t *pass1Output,	912 ; int16_t *pass1Output,

913 ; int16_t skip_adding,	913 ; int16_t skip_adding,

914 ; uint8_t *dest,	914 ; uint8_t *dest,

915 ; int dest_stride)	915 ; int dest_stride)

916 ;	916 ;

917 ; r0 int16_t *src	917 ; r0 int16_t *src

918 ; r1 int16_t *output,	918 ; r1 int16_t *output,

919 ; r2 int16_t *pass1Output,	919 ; r2 int16_t *pass1Output,

920 ; r3 int16_t skip_adding,	920 ; r3 int16_t skip_adding,

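921 ; stack uint8_t *dest (5th argument; AAPCS passes only r0-r3 in registers)	921 ; stack uint8_t *dest (5th argument; AAPCS passes only r0-r3 in registers)

922 ; stack int dest_stride (6th argument; AAPCS passes only r0-r3 in registers)	922 ; stack int dest_stride (6th argument; AAPCS passes only r0-r3 in registers)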

923	923

924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output	924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output

925 ; will be stored back into q8-q15 registers. This function will touch q0-q7	925 ; will be stored back into q8-q15 registers. This function will touch q0-q7

926 ; registers and use them as buffer during calculation.	926 ; registers and use them as buffer during calculation.

927 \|vp9_idct16x16_10_add_neon_pass2\| PROC	927 \|vpx_idct16x16_10_add_neon_pass2\| PROC

928 push {r3-r9}	928 push {r3-r9}

929	929

930 ; TODO(hkuang): Find a better way to load the elements.	930 ; TODO(hkuang): Find a better way to load the elements.

931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15	931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15

932 vld2.s16 {q8,q9}, [r0]!	932 vld2.s16 {q8,q9}, [r0]!

933 vld2.s16 {q9,q10}, [r0]!	933 vld2.s16 {q9,q10}, [r0]!

934 vld2.s16 {q10,q11}, [r0]!	934 vld2.s16 {q10,q11}, [r0]!

935 vld2.s16 {q11,q12}, [r0]!	935 vld2.s16 {q11,q12}, [r0]!

936 vld2.s16 {q12,q13}, [r0]!	936 vld2.s16 {q12,q13}, [r0]!

937 vld2.s16 {q13,q14}, [r0]!	937 vld2.s16 {q13,q14}, [r0]!

(...skipping 230 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1168 vst1.64 {d9}, [r1], r5	1168 vst1.64 {d9}, [r1], r5

1169 vst1.64 {d10}, [r1], r3	1169 vst1.64 {d10}, [r1], r3

1170 vst1.64 {d11}, [r1], r5	1170 vst1.64 {d11}, [r1], r5

1171 vst1.64 {d28}, [r1], r3	1171 vst1.64 {d28}, [r1], r3

1172 vst1.64 {d29}, [r1], r5	1172 vst1.64 {d29}, [r1], r5

1173 vst1.64 {d30}, [r1], r3	1173 vst1.64 {d30}, [r1], r3

1174 vst1.64 {d31}, [r1], r5	1174 vst1.64 {d31}, [r1], r5

1175 end_idct10_16x16_pass2	1175 end_idct10_16x16_pass2

1176 pop {r3-r9}	1176 pop {r3-r9}

1177 bx lr	1177 bx lr

1178 ENDP ; \|vp9_idct16x16_10_add_neon_pass2\|	1178 ENDP ; \|vpx_idct16x16_10_add_neon_pass2\|

1179 END	1179 END

OLD	NEW

« no previous file with comments | « source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c ('k') | source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c » ('j') | no next file with comments »