Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(129)

Side by Side Diff: source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 EXPORT |vp9_idct16x16_256_add_neon_pass1| 11 EXPORT |vpx_idct16x16_256_add_neon_pass1|
12 EXPORT |vp9_idct16x16_256_add_neon_pass2| 12 EXPORT |vpx_idct16x16_256_add_neon_pass2|
13 EXPORT |vp9_idct16x16_10_add_neon_pass1| 13 EXPORT |vpx_idct16x16_10_add_neon_pass1|
14 EXPORT |vp9_idct16x16_10_add_neon_pass2| 14 EXPORT |vpx_idct16x16_10_add_neon_pass2|
15 ARM 15 ARM
16 REQUIRE8 16 REQUIRE8
17 PRESERVE8 17 PRESERVE8
18 18
19 AREA ||.text||, CODE, READONLY, ALIGN=2 19 AREA ||.text||, CODE, READONLY, ALIGN=2
20 20
21 ; Transpose an 8x8 matrix of 16-bit elements. The data is loaded in q8-q15. 21 ; Transpose an 8x8 matrix of 16-bit elements. The data is loaded in q8-q15.
22 MACRO 22 MACRO
23 TRANSPOSE8X8 23 TRANSPOSE8X8
24 vswp d17, d24 24 vswp d17, d24
25 vswp d23, d30 25 vswp d23, d30
26 vswp d21, d28 26 vswp d21, d28
27 vswp d19, d26 27 vswp d19, d26
28 vtrn.32 q8, q10 28 vtrn.32 q8, q10
29 vtrn.32 q9, q11 29 vtrn.32 q9, q11
30 vtrn.32 q12, q14 30 vtrn.32 q12, q14
31 vtrn.32 q13, q15 31 vtrn.32 q13, q15
32 vtrn.16 q8, q9 32 vtrn.16 q8, q9
33 vtrn.16 q10, q11 33 vtrn.16 q10, q11
34 vtrn.16 q12, q13 34 vtrn.16 q12, q13
35 vtrn.16 q14, q15 35 vtrn.16 q14, q15
36 MEND 36 MEND
37 37
38 AREA Block, CODE, READONLY ; name this block of code 38 AREA Block, CODE, READONLY ; name this block of code
39 ;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, 39 ;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
40 ; int16_t *output, int output_stride) 40 ; int16_t *output, int output_stride)
41 ; 41 ;
42 ; r0 int16_t *input 42 ; r0 int16_t *input
43 ; r1 int16_t *output 43 ; r1 int16_t *output
44 ; r2 int output_stride 44 ; r2 int output_stride
45 45
46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output 46 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
47 ; will be stored back into q8-q15 registers. This function will touch q0-q7 47 ; will be stored back into q8-q15 registers. This function will touch q0-q7
48 ; registers and use them as buffer during calculation. 48 ; registers and use them as buffer during calculation.
49 |vp9_idct16x16_256_add_neon_pass1| PROC 49 |vpx_idct16x16_256_add_neon_pass1| PROC
50 50
51 ; TODO(hkuang): Find a better way to load the elements. 51 ; TODO(hkuang): Find a better way to load the elements.
52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 52 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
53 vld2.s16 {q8,q9}, [r0]! 53 vld2.s16 {q8,q9}, [r0]!
54 vld2.s16 {q9,q10}, [r0]! 54 vld2.s16 {q9,q10}, [r0]!
55 vld2.s16 {q10,q11}, [r0]! 55 vld2.s16 {q10,q11}, [r0]!
56 vld2.s16 {q11,q12}, [r0]! 56 vld2.s16 {q11,q12}, [r0]!
57 vld2.s16 {q12,q13}, [r0]! 57 vld2.s16 {q12,q13}, [r0]!
58 vld2.s16 {q13,q14}, [r0]! 58 vld2.s16 {q13,q14}, [r0]!
59 vld2.s16 {q14,q15}, [r0]! 59 vld2.s16 {q14,q15}, [r0]!
(...skipping 206 matching lines...) Expand 10 before | Expand all | Expand 10 after
266 vst1.64 {d24}, [r1], r2 266 vst1.64 {d24}, [r1], r2
267 vst1.64 {d25}, [r1], r2 267 vst1.64 {d25}, [r1], r2
268 vst1.64 {d26}, [r1], r2 268 vst1.64 {d26}, [r1], r2
269 vst1.64 {d27}, [r1], r2 269 vst1.64 {d27}, [r1], r2
270 vst1.64 {d28}, [r1], r2 270 vst1.64 {d28}, [r1], r2
271 vst1.64 {d29}, [r1], r2 271 vst1.64 {d29}, [r1], r2
272 vst1.64 {d30}, [r1], r2 272 vst1.64 {d30}, [r1], r2
273 vst1.64 {d31}, [r1], r2 273 vst1.64 {d31}, [r1], r2
274 274
275 bx lr 275 bx lr
276 ENDP ; |vp9_idct16x16_256_add_neon_pass1| 276 ENDP ; |vpx_idct16x16_256_add_neon_pass1|
277 277
278 ;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, 278 ;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
279 ; int16_t *output, 279 ; int16_t *output,
280 ; int16_t *pass1Output, 280 ; int16_t *pass1Output,
281 ; int16_t skip_adding, 281 ; int16_t skip_adding,
282 ; uint8_t *dest, 282 ; uint8_t *dest,
283 ; int dest_stride) 283 ; int dest_stride)
284 ; 284 ;
285 ; r0 int16_t *src 285 ; r0 int16_t *src
286 ; r1 int16_t *output, 286 ; r1 int16_t *output,
287 ; r2 int16_t *pass1Output, 287 ; r2 int16_t *pass1Output,
288 ; r3 int16_t skip_adding, 288 ; r3 int16_t skip_adding,
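289 ; [sp, #0] uint8_t *dest, (5th argument: on the stack at entry per AAPCS, not in r4) 289 ; [sp, #0] uint8_t *dest, (5th argument: on the stack at entry per AAPCS, not in r4)
290 ; [sp, #4] int dest_stride (6th argument: on the stack at entry per AAPCS, not in r5) 290 ; [sp, #4] int dest_stride (6th argument: on the stack at entry per AAPCS, not in r5)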
291 291
292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output 292 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
293 ; will be stored back into q8-q15 registers. This function will touch q0-q7 293 ; will be stored back into q8-q15 registers. This function will touch q0-q7
294 ; registers and use them as buffer during calculation. 294 ; registers and use them as buffer during calculation.
295 |vp9_idct16x16_256_add_neon_pass2| PROC 295 |vpx_idct16x16_256_add_neon_pass2| PROC
296 push {r3-r9} 296 push {r3-r9}
297 297
298 ; TODO(hkuang): Find a better way to load the elements. 298 ; TODO(hkuang): Find a better way to load the elements.
299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 299 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
300 vld2.s16 {q8,q9}, [r0]! 300 vld2.s16 {q8,q9}, [r0]!
301 vld2.s16 {q9,q10}, [r0]! 301 vld2.s16 {q9,q10}, [r0]!
302 vld2.s16 {q10,q11}, [r0]! 302 vld2.s16 {q10,q11}, [r0]!
303 vld2.s16 {q11,q12}, [r0]! 303 vld2.s16 {q11,q12}, [r0]!
304 vld2.s16 {q12,q13}, [r0]! 304 vld2.s16 {q12,q13}, [r0]!
305 vld2.s16 {q13,q14}, [r0]! 305 vld2.s16 {q13,q14}, [r0]!
(...skipping 471 matching lines...) Expand 10 before | Expand all | Expand 10 after
777 vst1.64 {d9}, [r1], r5 777 vst1.64 {d9}, [r1], r5
778 vst1.64 {d10}, [r1], r3 778 vst1.64 {d10}, [r1], r3
779 vst1.64 {d11}, [r1], r5 779 vst1.64 {d11}, [r1], r5
780 vst1.64 {d28}, [r1], r3 780 vst1.64 {d28}, [r1], r3
781 vst1.64 {d29}, [r1], r5 781 vst1.64 {d29}, [r1], r5
782 vst1.64 {d30}, [r1], r3 782 vst1.64 {d30}, [r1], r3
783 vst1.64 {d31}, [r1], r5 783 vst1.64 {d31}, [r1], r5
784 end_idct16x16_pass2 784 end_idct16x16_pass2
785 pop {r3-r9} 785 pop {r3-r9}
786 bx lr 786 bx lr
787 ENDP ; |vp9_idct16x16_256_add_neon_pass2| 787 ENDP ; |vpx_idct16x16_256_add_neon_pass2|
788 788
789 ;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, 789 ;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
790 ; int16_t *output, int output_stride ) 790 ; int16_t *output, int output_stride )
791 ; 791 ;
792 ; r0 int16_t *input 792 ; r0 int16_t *input
793 ; r1 int16_t *output 793 ; r1 int16_t *output
794 ; r2 int output_stride 794 ; r2 int output_stride
795 795
796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output 796 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
797 ; will be stored back into q8-q15 registers. This function will touch q0-q7 797 ; will be stored back into q8-q15 registers. This function will touch q0-q7
798 ; registers and use them as buffer during calculation. 798 ; registers and use them as buffer during calculation.
799 |vp9_idct16x16_10_add_neon_pass1| PROC 799 |vpx_idct16x16_10_add_neon_pass1| PROC
800 800
801 ; TODO(hkuang): Find a better way to load the elements. 801 ; TODO(hkuang): Find a better way to load the elements.
802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 802 ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
803 vld2.s16 {q8,q9}, [r0]! 803 vld2.s16 {q8,q9}, [r0]!
804 vld2.s16 {q9,q10}, [r0]! 804 vld2.s16 {q9,q10}, [r0]!
805 vld2.s16 {q10,q11}, [r0]! 805 vld2.s16 {q10,q11}, [r0]!
806 vld2.s16 {q11,q12}, [r0]! 806 vld2.s16 {q11,q12}, [r0]!
807 vld2.s16 {q12,q13}, [r0]! 807 vld2.s16 {q12,q13}, [r0]!
808 vld2.s16 {q13,q14}, [r0]! 808 vld2.s16 {q13,q14}, [r0]!
809 vld2.s16 {q14,q15}, [r0]! 809 vld2.s16 {q14,q15}, [r0]!
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after
898 vst1.64 {d24}, [r1], r2 898 vst1.64 {d24}, [r1], r2
899 vst1.64 {d25}, [r1], r2 899 vst1.64 {d25}, [r1], r2
900 vst1.64 {d26}, [r1], r2 900 vst1.64 {d26}, [r1], r2
901 vst1.64 {d27}, [r1], r2 901 vst1.64 {d27}, [r1], r2
902 vst1.64 {d28}, [r1], r2 902 vst1.64 {d28}, [r1], r2
903 vst1.64 {d29}, [r1], r2 903 vst1.64 {d29}, [r1], r2
904 vst1.64 {d30}, [r1], r2 904 vst1.64 {d30}, [r1], r2
905 vst1.64 {d31}, [r1], r2 905 vst1.64 {d31}, [r1], r2
906 906
907 bx lr 907 bx lr
908 ENDP ; |vp9_idct16x16_10_add_neon_pass1| 908 ENDP ; |vpx_idct16x16_10_add_neon_pass1|
909 909
910 ;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, 910 ;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
911 ; int16_t *output, 911 ; int16_t *output,
912 ; int16_t *pass1Output, 912 ; int16_t *pass1Output,
913 ; int16_t skip_adding, 913 ; int16_t skip_adding,
914 ; uint8_t *dest, 914 ; uint8_t *dest,
915 ; int dest_stride) 915 ; int dest_stride)
916 ; 916 ;
917 ; r0 int16_t *src 917 ; r0 int16_t *src
918 ; r1 int16_t *output, 918 ; r1 int16_t *output,
919 ; r2 int16_t *pass1Output, 919 ; r2 int16_t *pass1Output,
920 ; r3 int16_t skip_adding, 920 ; r3 int16_t skip_adding,
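921 ; [sp, #0] uint8_t *dest, (5th argument: on the stack at entry per AAPCS, not in r4) 921 ; [sp, #0] uint8_t *dest, (5th argument: on the stack at entry per AAPCS, not in r4)
922 ; [sp, #4] int dest_stride (6th argument: on the stack at entry per AAPCS, not in r5) 922 ; [sp, #4] int dest_stride (6th argument: on the stack at entry per AAPCS, not in r5)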
923 923
924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output 924 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
925 ; will be stored back into q8-q15 registers. This function will touch q0-q7 925 ; will be stored back into q8-q15 registers. This function will touch q0-q7
926 ; registers and use them as buffer during calculation. 926 ; registers and use them as buffer during calculation.
927 |vp9_idct16x16_10_add_neon_pass2| PROC 927 |vpx_idct16x16_10_add_neon_pass2| PROC
928 push {r3-r9} 928 push {r3-r9}
929 929
930 ; TODO(hkuang): Find a better way to load the elements. 930 ; TODO(hkuang): Find a better way to load the elements.
931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 931 ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
932 vld2.s16 {q8,q9}, [r0]! 932 vld2.s16 {q8,q9}, [r0]!
933 vld2.s16 {q9,q10}, [r0]! 933 vld2.s16 {q9,q10}, [r0]!
934 vld2.s16 {q10,q11}, [r0]! 934 vld2.s16 {q10,q11}, [r0]!
935 vld2.s16 {q11,q12}, [r0]! 935 vld2.s16 {q11,q12}, [r0]!
936 vld2.s16 {q12,q13}, [r0]! 936 vld2.s16 {q12,q13}, [r0]!
937 vld2.s16 {q13,q14}, [r0]! 937 vld2.s16 {q13,q14}, [r0]!
(...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after
1168 vst1.64 {d9}, [r1], r5 1168 vst1.64 {d9}, [r1], r5
1169 vst1.64 {d10}, [r1], r3 1169 vst1.64 {d10}, [r1], r3
1170 vst1.64 {d11}, [r1], r5 1170 vst1.64 {d11}, [r1], r5
1171 vst1.64 {d28}, [r1], r3 1171 vst1.64 {d28}, [r1], r3
1172 vst1.64 {d29}, [r1], r5 1172 vst1.64 {d29}, [r1], r5
1173 vst1.64 {d30}, [r1], r3 1173 vst1.64 {d30}, [r1], r3
1174 vst1.64 {d31}, [r1], r5 1174 vst1.64 {d31}, [r1], r5
1175 end_idct10_16x16_pass2 1175 end_idct10_16x16_pass2
1176 pop {r3-r9} 1176 pop {r3-r9}
1177 bx lr 1177 bx lr
1178 ENDP ; |vp9_idct16x16_10_add_neon_pass2| 1178 ENDP ; |vpx_idct16x16_10_add_neon_pass2|
1179 END 1179 END
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c ('k') | source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698