OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 EXPORT |vp9_idct16x16_256_add_neon_pass1| | 11 EXPORT |vpx_idct16x16_256_add_neon_pass1| |
12 EXPORT |vp9_idct16x16_256_add_neon_pass2| | 12 EXPORT |vpx_idct16x16_256_add_neon_pass2| |
13 EXPORT |vp9_idct16x16_10_add_neon_pass1| | 13 EXPORT |vpx_idct16x16_10_add_neon_pass1| |
14 EXPORT |vp9_idct16x16_10_add_neon_pass2| | 14 EXPORT |vpx_idct16x16_10_add_neon_pass2| |
15 ARM | 15 ARM |
16 REQUIRE8 | 16 REQUIRE8 |
17 PRESERVE8 | 17 PRESERVE8 |
18 | 18 |
19 AREA ||.text||, CODE, READONLY, ALIGN=2 | 19 AREA ||.text||, CODE, READONLY, ALIGN=2 |
20 | 20 |
21 ; Transpose an 8x8 matrix of 16-bit elements. The data must already be loaded in q8-q15. | 21 ; Transpose an 8x8 matrix of 16-bit elements. The data must already be loaded in q8-q15. |
22 MACRO | 22 MACRO |
23 TRANSPOSE8X8 | 23 TRANSPOSE8X8 |
; TRANSPOSE8X8 takes no parameters. It transposes, in place, the 8x8 matrix of
; 16-bit elements held in q8-q15. Every operand below is one of q8-q15 or one
; of their 64-bit halves (d16-d31), so no other register is written.
;
; Step 1: exchange the upper 64-bit half of q8..q11 with the lower 64-bit half
; of q12..q15, i.e. swap the two off-diagonal 4x4 quadrants of the matrix.
24 vswp d17, d24 | 24 vswp d17, d24 |
25 vswp d23, d30 | 25 vswp d23, d30 |
26 vswp d21, d28 | 26 vswp d21, d28 |
27 vswp d19, d26 | 27 vswp d19, d26 |
; Step 2: transpose at 32-bit granularity between registers two apart
; (q8/q10, q9/q11, q12/q14, q13/q15).
28 vtrn.32 q8, q10 | 28 vtrn.32 q8, q10 |
29 vtrn.32 q9, q11 | 29 vtrn.32 q9, q11 |
30 vtrn.32 q12, q14 | 30 vtrn.32 q12, q14 |
31 vtrn.32 q13, q15 | 31 vtrn.32 q13, q15 |
; Step 3: transpose at 16-bit granularity between adjacent registers
; (q8/q9, q10/q11, q12/q13, q14/q15), which completes the 8x8 transpose.
32 vtrn.16 q8, q9 | 32 vtrn.16 q8, q9 |
33 vtrn.16 q10, q11 | 33 vtrn.16 q10, q11 |
34 vtrn.16 q12, q13 | 34 vtrn.16 q12, q13 |
35 vtrn.16 q14, q15 | 35 vtrn.16 q14, q15 |
36 MEND | 36 MEND |
37 | 37 |
38 AREA Block, CODE, READONLY ; name this block of code | 38 AREA Block, CODE, READONLY ; name this block of code |
39 ;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, | 39 ;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, |
40 ; int16_t *output, int output_stride) | 40 ; int16_t *output, int output_stride) |
41 ; | 41 ; |
42 ; r0 int16_t *input | 42 ; r0 int16_t *input |
43 ; r1 int16_t *output | 43 ; r1 int16_t *output |
44 ; r2 int output_stride) | 44 ; r2 int output_stride) |
45 | 45 |
46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output | 46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output |
47 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 47 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
48 ; registers and use them as buffer during calculation. | 48 ; registers and use them as buffer during calculation. |
49 |vp9_idct16x16_256_add_neon_pass1| PROC | 49 |vpx_idct16x16_256_add_neon_pass1| PROC |
50 | 50 |
51 ; TODO(hkuang): Find a better way to load the elements. | 51 ; TODO(hkuang): Find a better way to load the elements. |
52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 | 52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 |
53 vld2.s16 {q8,q9}, [r0]! | 53 vld2.s16 {q8,q9}, [r0]! |
54 vld2.s16 {q9,q10}, [r0]! | 54 vld2.s16 {q9,q10}, [r0]! |
55 vld2.s16 {q10,q11}, [r0]! | 55 vld2.s16 {q10,q11}, [r0]! |
56 vld2.s16 {q11,q12}, [r0]! | 56 vld2.s16 {q11,q12}, [r0]! |
57 vld2.s16 {q12,q13}, [r0]! | 57 vld2.s16 {q12,q13}, [r0]! |
58 vld2.s16 {q13,q14}, [r0]! | 58 vld2.s16 {q13,q14}, [r0]! |
59 vld2.s16 {q14,q15}, [r0]! | 59 vld2.s16 {q14,q15}, [r0]! |
(...skipping 206 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
266 vst1.64 {d24}, [r1], r2 | 266 vst1.64 {d24}, [r1], r2 |
267 vst1.64 {d25}, [r1], r2 | 267 vst1.64 {d25}, [r1], r2 |
268 vst1.64 {d26}, [r1], r2 | 268 vst1.64 {d26}, [r1], r2 |
269 vst1.64 {d27}, [r1], r2 | 269 vst1.64 {d27}, [r1], r2 |
270 vst1.64 {d28}, [r1], r2 | 270 vst1.64 {d28}, [r1], r2 |
271 vst1.64 {d29}, [r1], r2 | 271 vst1.64 {d29}, [r1], r2 |
272 vst1.64 {d30}, [r1], r2 | 272 vst1.64 {d30}, [r1], r2 |
273 vst1.64 {d31}, [r1], r2 | 273 vst1.64 {d31}, [r1], r2 |
274 | 274 |
275 bx lr | 275 bx lr |
276 ENDP ; |vp9_idct16x16_256_add_neon_pass1| | 276 ENDP ; |vpx_idct16x16_256_add_neon_pass1| |
277 | 277 |
278 ;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, | 278 ;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, |
279 ; int16_t *output, | 279 ; int16_t *output, |
280 ; int16_t *pass1Output, | 280 ; int16_t *pass1Output, |
281 ; int16_t skip_adding, | 281 ; int16_t skip_adding, |
282 ; uint8_t *dest, | 282 ; uint8_t *dest, |
283 ; int dest_stride) | 283 ; int dest_stride) |
284 ; | 284 ; |
285 ; r0 int16_t *src | 285 ; r0 int16_t *src |
286 ; r1 int16_t *output, | 286 ; r1 int16_t *output, |
287 ; r2 int16_t *pass1Output, | 287 ; r2 int16_t *pass1Output, |
288 ; r3 int16_t skip_adding, | 288 ; r3 int16_t skip_adding, |
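289 ; r4 uint8_t *dest, (NOTE(review): 5th argument; AAPCS passes only the first four in r0-r3, so this presumably arrives on the stack and is loaded into a register later - confirm) | 289 ; r4 uint8_t *dest, (NOTE(review): 5th argument; AAPCS passes only the first four in r0-r3, so this presumably arrives on the stack and is loaded into a register later - confirm) |
290 ; r5 int dest_stride) (NOTE(review): 6th argument; presumably also passed on the stack rather than in r5 - confirm) | 290 ; r5 int dest_stride) (NOTE(review): 6th argument; presumably also passed on the stack rather than in r5 - confirm) |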
291 | 291 |
292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output | 292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output |
293 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 293 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
294 ; registers and use them as buffer during calculation. | 294 ; registers and use them as buffer during calculation. |
295 |vp9_idct16x16_256_add_neon_pass2| PROC | 295 |vpx_idct16x16_256_add_neon_pass2| PROC |
296 push {r3-r9} | 296 push {r3-r9} |
297 | 297 |
298 ; TODO(hkuang): Find a better way to load the elements. | 298 ; TODO(hkuang): Find a better way to load the elements. |
299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 | 299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 |
300 vld2.s16 {q8,q9}, [r0]! | 300 vld2.s16 {q8,q9}, [r0]! |
301 vld2.s16 {q9,q10}, [r0]! | 301 vld2.s16 {q9,q10}, [r0]! |
302 vld2.s16 {q10,q11}, [r0]! | 302 vld2.s16 {q10,q11}, [r0]! |
303 vld2.s16 {q11,q12}, [r0]! | 303 vld2.s16 {q11,q12}, [r0]! |
304 vld2.s16 {q12,q13}, [r0]! | 304 vld2.s16 {q12,q13}, [r0]! |
305 vld2.s16 {q13,q14}, [r0]! | 305 vld2.s16 {q13,q14}, [r0]! |
(...skipping 471 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
777 vst1.64 {d9}, [r1], r5 | 777 vst1.64 {d9}, [r1], r5 |
778 vst1.64 {d10}, [r1], r3 | 778 vst1.64 {d10}, [r1], r3 |
779 vst1.64 {d11}, [r1], r5 | 779 vst1.64 {d11}, [r1], r5 |
780 vst1.64 {d28}, [r1], r3 | 780 vst1.64 {d28}, [r1], r3 |
781 vst1.64 {d29}, [r1], r5 | 781 vst1.64 {d29}, [r1], r5 |
782 vst1.64 {d30}, [r1], r3 | 782 vst1.64 {d30}, [r1], r3 |
783 vst1.64 {d31}, [r1], r5 | 783 vst1.64 {d31}, [r1], r5 |
784 end_idct16x16_pass2 | 784 end_idct16x16_pass2 |
785 pop {r3-r9} | 785 pop {r3-r9} |
786 bx lr | 786 bx lr |
787 ENDP ; |vp9_idct16x16_256_add_neon_pass2| | 787 ENDP ; |vpx_idct16x16_256_add_neon_pass2| |
788 | 788 |
789 ;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, | 789 ;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, |
790 ; int16_t *output, int output_stride) | 790 ; int16_t *output, int output_stride) |
791 ; | 791 ; |
792 ; r0 int16_t *input | 792 ; r0 int16_t *input |
793 ; r1 int16_t *output | 793 ; r1 int16_t *output |
794 ; r2 int output_stride) | 794 ; r2 int output_stride) |
795 | 795 |
796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output | 796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output |
797 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 797 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
798 ; registers and use them as buffer during calculation. | 798 ; registers and use them as buffer during calculation. |
799 |vp9_idct16x16_10_add_neon_pass1| PROC | 799 |vpx_idct16x16_10_add_neon_pass1| PROC |
800 | 800 |
801 ; TODO(hkuang): Find a better way to load the elements. | 801 ; TODO(hkuang): Find a better way to load the elements. |
802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 | 802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 |
803 vld2.s16 {q8,q9}, [r0]! | 803 vld2.s16 {q8,q9}, [r0]! |
804 vld2.s16 {q9,q10}, [r0]! | 804 vld2.s16 {q9,q10}, [r0]! |
805 vld2.s16 {q10,q11}, [r0]! | 805 vld2.s16 {q10,q11}, [r0]! |
806 vld2.s16 {q11,q12}, [r0]! | 806 vld2.s16 {q11,q12}, [r0]! |
807 vld2.s16 {q12,q13}, [r0]! | 807 vld2.s16 {q12,q13}, [r0]! |
808 vld2.s16 {q13,q14}, [r0]! | 808 vld2.s16 {q13,q14}, [r0]! |
809 vld2.s16 {q14,q15}, [r0]! | 809 vld2.s16 {q14,q15}, [r0]! |
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
898 vst1.64 {d24}, [r1], r2 | 898 vst1.64 {d24}, [r1], r2 |
899 vst1.64 {d25}, [r1], r2 | 899 vst1.64 {d25}, [r1], r2 |
900 vst1.64 {d26}, [r1], r2 | 900 vst1.64 {d26}, [r1], r2 |
901 vst1.64 {d27}, [r1], r2 | 901 vst1.64 {d27}, [r1], r2 |
902 vst1.64 {d28}, [r1], r2 | 902 vst1.64 {d28}, [r1], r2 |
903 vst1.64 {d29}, [r1], r2 | 903 vst1.64 {d29}, [r1], r2 |
904 vst1.64 {d30}, [r1], r2 | 904 vst1.64 {d30}, [r1], r2 |
905 vst1.64 {d31}, [r1], r2 | 905 vst1.64 {d31}, [r1], r2 |
906 | 906 |
907 bx lr | 907 bx lr |
908 ENDP ; |vp9_idct16x16_10_add_neon_pass1| | 908 ENDP ; |vpx_idct16x16_10_add_neon_pass1| |
909 | 909 |
910 ;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, | 910 ;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, |
911 ; int16_t *output, | 911 ; int16_t *output, |
912 ; int16_t *pass1Output, | 912 ; int16_t *pass1Output, |
913 ; int16_t skip_adding, | 913 ; int16_t skip_adding, |
914 ; uint8_t *dest, | 914 ; uint8_t *dest, |
915 ; int dest_stride) | 915 ; int dest_stride) |
916 ; | 916 ; |
917 ; r0 int16_t *src | 917 ; r0 int16_t *src |
918 ; r1 int16_t *output, | 918 ; r1 int16_t *output, |
919 ; r2 int16_t *pass1Output, | 919 ; r2 int16_t *pass1Output, |
920 ; r3 int16_t skip_adding, | 920 ; r3 int16_t skip_adding, |
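921 ; r4 uint8_t *dest, (NOTE(review): 5th argument; AAPCS passes only the first four in r0-r3, so this presumably arrives on the stack and is loaded into a register later - confirm) | 921 ; r4 uint8_t *dest, (NOTE(review): 5th argument; AAPCS passes only the first four in r0-r3, so this presumably arrives on the stack and is loaded into a register later - confirm) |
922 ; r5 int dest_stride) (NOTE(review): 6th argument; presumably also passed on the stack rather than in r5 - confirm) | 922 ; r5 int dest_stride) (NOTE(review): 6th argument; presumably also passed on the stack rather than in r5 - confirm) |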
923 | 923 |
924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output | 924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output |
925 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 925 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
926 ; registers and use them as buffer during calculation. | 926 ; registers and use them as buffer during calculation. |
927 |vp9_idct16x16_10_add_neon_pass2| PROC | 927 |vpx_idct16x16_10_add_neon_pass2| PROC |
928 push {r3-r9} | 928 push {r3-r9} |
929 | 929 |
930 ; TODO(hkuang): Find a better way to load the elements. | 930 ; TODO(hkuang): Find a better way to load the elements. |
931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 | 931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 |
932 vld2.s16 {q8,q9}, [r0]! | 932 vld2.s16 {q8,q9}, [r0]! |
933 vld2.s16 {q9,q10}, [r0]! | 933 vld2.s16 {q9,q10}, [r0]! |
934 vld2.s16 {q10,q11}, [r0]! | 934 vld2.s16 {q10,q11}, [r0]! |
935 vld2.s16 {q11,q12}, [r0]! | 935 vld2.s16 {q11,q12}, [r0]! |
936 vld2.s16 {q12,q13}, [r0]! | 936 vld2.s16 {q12,q13}, [r0]! |
937 vld2.s16 {q13,q14}, [r0]! | 937 vld2.s16 {q13,q14}, [r0]! |
(...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1168 vst1.64 {d9}, [r1], r5 | 1168 vst1.64 {d9}, [r1], r5 |
1169 vst1.64 {d10}, [r1], r3 | 1169 vst1.64 {d10}, [r1], r3 |
1170 vst1.64 {d11}, [r1], r5 | 1170 vst1.64 {d11}, [r1], r5 |
1171 vst1.64 {d28}, [r1], r3 | 1171 vst1.64 {d28}, [r1], r3 |
1172 vst1.64 {d29}, [r1], r5 | 1172 vst1.64 {d29}, [r1], r5 |
1173 vst1.64 {d30}, [r1], r3 | 1173 vst1.64 {d30}, [r1], r3 |
1174 vst1.64 {d31}, [r1], r5 | 1174 vst1.64 {d31}, [r1], r5 |
1175 end_idct10_16x16_pass2 | 1175 end_idct10_16x16_pass2 |
1176 pop {r3-r9} | 1176 pop {r3-r9} |
1177 bx lr | 1177 bx lr |
1178 ENDP ; |vp9_idct16x16_10_add_neon_pass2| | 1178 ENDP ; |vpx_idct16x16_10_add_neon_pass2| |
1179 END | 1179 END |
OLD | NEW |