| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 EXPORT |vp9_idct16x16_256_add_neon_pass1| | 11 EXPORT |vpx_idct16x16_256_add_neon_pass1| |
| 12 EXPORT |vp9_idct16x16_256_add_neon_pass2| | 12 EXPORT |vpx_idct16x16_256_add_neon_pass2| |
| 13 EXPORT |vp9_idct16x16_10_add_neon_pass1| | 13 EXPORT |vpx_idct16x16_10_add_neon_pass1| |
| 14 EXPORT |vp9_idct16x16_10_add_neon_pass2| | 14 EXPORT |vpx_idct16x16_10_add_neon_pass2| |
| 15 ARM | 15 ARM |
| 16 REQUIRE8 | 16 REQUIRE8 |
| 17 PRESERVE8 | 17 PRESERVE8 |
| 18 | 18 |
| 19 AREA ||.text||, CODE, READONLY, ALIGN=2 | 19 AREA ||.text||, CODE, READONLY, ALIGN=2 |
| 20 | 20 |
| 21 ; Transpose an 8x8 matrix of 16-bit elements in place. The eight rows are expected in q8-q15 (d16-d31) and the result is left in the same registers. Sequence: vswp on 64-bit halves, then vtrn.32, then vtrn.16. No other registers are referenced. | 21 ; Transpose an 8x8 matrix of 16-bit elements in place. The eight rows are expected in q8-q15 (d16-d31) and the result is left in the same registers. Sequence: vswp on 64-bit halves, then vtrn.32, then vtrn.16. No other registers are referenced. |
| 22 MACRO | 22 MACRO |
| 23 TRANSPOSE8X8 | 23 TRANSPOSE8X8 |
| 24 vswp d17, d24 | 24 vswp d17, d24 |
| 25 vswp d23, d30 | 25 vswp d23, d30 |
| 26 vswp d21, d28 | 26 vswp d21, d28 |
| 27 vswp d19, d26 | 27 vswp d19, d26 |
| 28 vtrn.32 q8, q10 | 28 vtrn.32 q8, q10 |
| 29 vtrn.32 q9, q11 | 29 vtrn.32 q9, q11 |
| 30 vtrn.32 q12, q14 | 30 vtrn.32 q12, q14 |
| 31 vtrn.32 q13, q15 | 31 vtrn.32 q13, q15 |
| 32 vtrn.16 q8, q9 | 32 vtrn.16 q8, q9 |
| 33 vtrn.16 q10, q11 | 33 vtrn.16 q10, q11 |
| 34 vtrn.16 q12, q13 | 34 vtrn.16 q12, q13 |
| 35 vtrn.16 q14, q15 | 35 vtrn.16 q14, q15 |
| 36 MEND | 36 MEND |
| 37 | 37 |
| 38 AREA Block, CODE, READONLY ; name this block of code | 38 AREA Block, CODE, READONLY ; name this block of code |
| 39 ;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, | 39 ;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, |
| 40 ; int16_t *output, int output_stride) | 40 ; int16_t *output, int output_stride) |
| 41 ; | 41 ; |
| 42 ; r0 int16_t *input | 42 ; r0 int16_t *input |
| 43 ; r1 int16_t *output | 43 ; r1 int16_t *output |
| 44 ; r2 int output_stride | 44 ; r2 int output_stride |
| 45 | 45 |
| 46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output | 46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output |
| 47 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 47 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
| 48 ; registers and use them as buffer during calculation. | 48 ; registers and use them as buffer during calculation. |
| 49 |vp9_idct16x16_256_add_neon_pass1| PROC | 49 |vpx_idct16x16_256_add_neon_pass1| PROC |
| 50 | 50 |
| 51 ; TODO(hkuang): Find a better way to load the elements. | 51 ; TODO(hkuang): Find a better way to load the elements. |
| 52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 | 52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 |
| 53 vld2.s16 {q8,q9}, [r0]! | 53 vld2.s16 {q8,q9}, [r0]! |
| 54 vld2.s16 {q9,q10}, [r0]! | 54 vld2.s16 {q9,q10}, [r0]! |
| 55 vld2.s16 {q10,q11}, [r0]! | 55 vld2.s16 {q10,q11}, [r0]! |
| 56 vld2.s16 {q11,q12}, [r0]! | 56 vld2.s16 {q11,q12}, [r0]! |
| 57 vld2.s16 {q12,q13}, [r0]! | 57 vld2.s16 {q12,q13}, [r0]! |
| 58 vld2.s16 {q13,q14}, [r0]! | 58 vld2.s16 {q13,q14}, [r0]! |
| 59 vld2.s16 {q14,q15}, [r0]! | 59 vld2.s16 {q14,q15}, [r0]! |
| (...skipping 206 matching lines...) | (...skipping 206 matching lines...) |
| 266 vst1.64 {d24}, [r1], r2 | 266 vst1.64 {d24}, [r1], r2 |
| 267 vst1.64 {d25}, [r1], r2 | 267 vst1.64 {d25}, [r1], r2 |
| 268 vst1.64 {d26}, [r1], r2 | 268 vst1.64 {d26}, [r1], r2 |
| 269 vst1.64 {d27}, [r1], r2 | 269 vst1.64 {d27}, [r1], r2 |
| 270 vst1.64 {d28}, [r1], r2 | 270 vst1.64 {d28}, [r1], r2 |
| 271 vst1.64 {d29}, [r1], r2 | 271 vst1.64 {d29}, [r1], r2 |
| 272 vst1.64 {d30}, [r1], r2 | 272 vst1.64 {d30}, [r1], r2 |
| 273 vst1.64 {d31}, [r1], r2 | 273 vst1.64 {d31}, [r1], r2 |
| 274 | 274 |
| 275 bx lr | 275 bx lr |
| 276 ENDP ; |vp9_idct16x16_256_add_neon_pass1| | 276 ENDP ; |vpx_idct16x16_256_add_neon_pass1| |
| 277 | 277 |
| 278 ;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, | 278 ;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, |
| 279 ; int16_t *output, | 279 ; int16_t *output, |
| 280 ; int16_t *pass1Output, | 280 ; int16_t *pass1Output, |
| 281 ; int16_t skip_adding, | 281 ; int16_t skip_adding, |
| 282 ; uint8_t *dest, | 282 ; uint8_t *dest, |
| 283 ; int dest_stride) | 283 ; int dest_stride) |
| 284 ; | 284 ; |
| 285 ; r0 int16_t *src | 285 ; r0 int16_t *src |
| 286 ; r1 int16_t *output, | 286 ; r1 int16_t *output, |
| 287 ; r2 int16_t *pass1Output, | 287 ; r2 int16_t *pass1Output, |
| 288 ; r3 int16_t skip_adding, | 288 ; r3 int16_t skip_adding, |
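| 289 ; r4 uint8_t *dest (5th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) | 289 ; r4 uint8_t *dest (5th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) |
| 290 ; r5 int dest_stride (6th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) | 290 ; r5 int dest_stride (6th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) |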
| 291 | 291 |
| 292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output | 292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output |
| 293 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 293 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
| 294 ; registers and use them as buffer during calculation. | 294 ; registers and use them as buffer during calculation. |
| 295 |vp9_idct16x16_256_add_neon_pass2| PROC | 295 |vpx_idct16x16_256_add_neon_pass2| PROC |
| 296 push {r3-r9} | 296 push {r3-r9} |
| 297 | 297 |
| 298 ; TODO(hkuang): Find a better way to load the elements. | 298 ; TODO(hkuang): Find a better way to load the elements. |
| 299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 | 299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 |
| 300 vld2.s16 {q8,q9}, [r0]! | 300 vld2.s16 {q8,q9}, [r0]! |
| 301 vld2.s16 {q9,q10}, [r0]! | 301 vld2.s16 {q9,q10}, [r0]! |
| 302 vld2.s16 {q10,q11}, [r0]! | 302 vld2.s16 {q10,q11}, [r0]! |
| 303 vld2.s16 {q11,q12}, [r0]! | 303 vld2.s16 {q11,q12}, [r0]! |
| 304 vld2.s16 {q12,q13}, [r0]! | 304 vld2.s16 {q12,q13}, [r0]! |
| 305 vld2.s16 {q13,q14}, [r0]! | 305 vld2.s16 {q13,q14}, [r0]! |
| (...skipping 471 matching lines...) | (...skipping 471 matching lines...) |
| 777 vst1.64 {d9}, [r1], r5 | 777 vst1.64 {d9}, [r1], r5 |
| 778 vst1.64 {d10}, [r1], r3 | 778 vst1.64 {d10}, [r1], r3 |
| 779 vst1.64 {d11}, [r1], r5 | 779 vst1.64 {d11}, [r1], r5 |
| 780 vst1.64 {d28}, [r1], r3 | 780 vst1.64 {d28}, [r1], r3 |
| 781 vst1.64 {d29}, [r1], r5 | 781 vst1.64 {d29}, [r1], r5 |
| 782 vst1.64 {d30}, [r1], r3 | 782 vst1.64 {d30}, [r1], r3 |
| 783 vst1.64 {d31}, [r1], r5 | 783 vst1.64 {d31}, [r1], r5 |
| 784 end_idct16x16_pass2 | 784 end_idct16x16_pass2 |
| 785 pop {r3-r9} | 785 pop {r3-r9} |
| 786 bx lr | 786 bx lr |
| 787 ENDP ; |vp9_idct16x16_256_add_neon_pass2| | 787 ENDP ; |vpx_idct16x16_256_add_neon_pass2| |
| 788 | 788 |
| 789 ;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, | 789 ;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, |
| 790 ; int16_t *output, int output_stride) | 790 ; int16_t *output, int output_stride) |
| 791 ; | 791 ; |
| 792 ; r0 int16_t *input | 792 ; r0 int16_t *input |
| 793 ; r1 int16_t *output | 793 ; r1 int16_t *output |
| 794 ; r2 int output_stride | 794 ; r2 int output_stride |
| 795 | 795 |
| 796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output | 796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output |
| 797 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 797 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
| 798 ; registers and use them as buffer during calculation. | 798 ; registers and use them as buffer during calculation. |
| 799 |vp9_idct16x16_10_add_neon_pass1| PROC | 799 |vpx_idct16x16_10_add_neon_pass1| PROC |
| 800 | 800 |
| 801 ; TODO(hkuang): Find a better way to load the elements. | 801 ; TODO(hkuang): Find a better way to load the elements. |
| 802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 | 802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 |
| 803 vld2.s16 {q8,q9}, [r0]! | 803 vld2.s16 {q8,q9}, [r0]! |
| 804 vld2.s16 {q9,q10}, [r0]! | 804 vld2.s16 {q9,q10}, [r0]! |
| 805 vld2.s16 {q10,q11}, [r0]! | 805 vld2.s16 {q10,q11}, [r0]! |
| 806 vld2.s16 {q11,q12}, [r0]! | 806 vld2.s16 {q11,q12}, [r0]! |
| 807 vld2.s16 {q12,q13}, [r0]! | 807 vld2.s16 {q12,q13}, [r0]! |
| 808 vld2.s16 {q13,q14}, [r0]! | 808 vld2.s16 {q13,q14}, [r0]! |
| 809 vld2.s16 {q14,q15}, [r0]! | 809 vld2.s16 {q14,q15}, [r0]! |
| (...skipping 88 matching lines...) | (...skipping 88 matching lines...) |
| 898 vst1.64 {d24}, [r1], r2 | 898 vst1.64 {d24}, [r1], r2 |
| 899 vst1.64 {d25}, [r1], r2 | 899 vst1.64 {d25}, [r1], r2 |
| 900 vst1.64 {d26}, [r1], r2 | 900 vst1.64 {d26}, [r1], r2 |
| 901 vst1.64 {d27}, [r1], r2 | 901 vst1.64 {d27}, [r1], r2 |
| 902 vst1.64 {d28}, [r1], r2 | 902 vst1.64 {d28}, [r1], r2 |
| 903 vst1.64 {d29}, [r1], r2 | 903 vst1.64 {d29}, [r1], r2 |
| 904 vst1.64 {d30}, [r1], r2 | 904 vst1.64 {d30}, [r1], r2 |
| 905 vst1.64 {d31}, [r1], r2 | 905 vst1.64 {d31}, [r1], r2 |
| 906 | 906 |
| 907 bx lr | 907 bx lr |
| 908 ENDP ; |vp9_idct16x16_10_add_neon_pass1| | 908 ENDP ; |vpx_idct16x16_10_add_neon_pass1| |
| 909 | 909 |
| 910 ;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, | 910 ;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, |
| 911 ; int16_t *output, | 911 ; int16_t *output, |
| 912 ; int16_t *pass1Output, | 912 ; int16_t *pass1Output, |
| 913 ; int16_t skip_adding, | 913 ; int16_t skip_adding, |
| 914 ; uint8_t *dest, | 914 ; uint8_t *dest, |
| 915 ; int dest_stride) | 915 ; int dest_stride) |
| 916 ; | 916 ; |
| 917 ; r0 int16_t *src | 917 ; r0 int16_t *src |
| 918 ; r1 int16_t *output, | 918 ; r1 int16_t *output, |
| 919 ; r2 int16_t *pass1Output, | 919 ; r2 int16_t *pass1Output, |
| 920 ; r3 int16_t skip_adding, | 920 ; r3 int16_t skip_adding, |
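| 921 ; r4 uint8_t *dest (5th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) | 921 ; r4 uint8_t *dest (5th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) |
| 922 ; r5 int dest_stride (6th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) | 922 ; r5 int dest_stride (6th argument - arrives on the stack under AAPCS; TODO confirm which register the body loads it into) |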
| 923 | 923 |
| 924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output | 924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output |
| 925 ; will be stored back into q8-q15 registers. This function will touch q0-q7 | 925 ; will be stored back into q8-q15 registers. This function will touch q0-q7 |
| 926 ; registers and use them as buffer during calculation. | 926 ; registers and use them as buffer during calculation. |
| 927 |vp9_idct16x16_10_add_neon_pass2| PROC | 927 |vpx_idct16x16_10_add_neon_pass2| PROC |
| 928 push {r3-r9} | 928 push {r3-r9} |
| 929 | 929 |
| 930 ; TODO(hkuang): Find a better way to load the elements. | 930 ; TODO(hkuang): Find a better way to load the elements. |
| 931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 | 931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 |
| 932 vld2.s16 {q8,q9}, [r0]! | 932 vld2.s16 {q8,q9}, [r0]! |
| 933 vld2.s16 {q9,q10}, [r0]! | 933 vld2.s16 {q9,q10}, [r0]! |
| 934 vld2.s16 {q10,q11}, [r0]! | 934 vld2.s16 {q10,q11}, [r0]! |
| 935 vld2.s16 {q11,q12}, [r0]! | 935 vld2.s16 {q11,q12}, [r0]! |
| 936 vld2.s16 {q12,q13}, [r0]! | 936 vld2.s16 {q12,q13}, [r0]! |
| 937 vld2.s16 {q13,q14}, [r0]! | 937 vld2.s16 {q13,q14}, [r0]! |
| (...skipping 230 matching lines...) | (...skipping 230 matching lines...) |
| 1168 vst1.64 {d9}, [r1], r5 | 1168 vst1.64 {d9}, [r1], r5 |
| 1169 vst1.64 {d10}, [r1], r3 | 1169 vst1.64 {d10}, [r1], r3 |
| 1170 vst1.64 {d11}, [r1], r5 | 1170 vst1.64 {d11}, [r1], r5 |
| 1171 vst1.64 {d28}, [r1], r3 | 1171 vst1.64 {d28}, [r1], r3 |
| 1172 vst1.64 {d29}, [r1], r5 | 1172 vst1.64 {d29}, [r1], r5 |
| 1173 vst1.64 {d30}, [r1], r3 | 1173 vst1.64 {d30}, [r1], r3 |
| 1174 vst1.64 {d31}, [r1], r5 | 1174 vst1.64 {d31}, [r1], r5 |
| 1175 end_idct10_16x16_pass2 | 1175 end_idct10_16x16_pass2 |
| 1176 pop {r3-r9} | 1176 pop {r3-r9} |
| 1177 bx lr | 1177 bx lr |
| 1178 ENDP ; |vp9_idct16x16_10_add_neon_pass2| | 1178 ENDP ; |vpx_idct16x16_10_add_neon_pass2| |
| 1179 END | 1179 END |
| OLD | NEW |