Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(41)

Side by Side Diff: source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 EXPORT |vp8_short_fdct4x4_neon| ; 4x4 forward DCT
13 EXPORT |vp8_short_fdct8x4_neon| ; two 4x4 forward DCTs (8x4 block) in one pass
14
15 ARM ; ARM (A32) encoding, not Thumb
16 REQUIRE8 ; require 8-byte stack alignment on entry
17 PRESERVE8 ; and preserve it (these routines use no stack)
18
19 AREA ||.text||, CODE, READONLY, ALIGN=4
20
21
22 ALIGN 16 ; enable use of @128 bit aligned loads
; DCT constant table; both routines walk it via r12 with post-incremented
; 128-bit-aligned loads, so entry order and 16-byte alignment matter.
23 coeff
24 DCW 5352, 5352, 5352, 5352 ; 16-bit multipliers, loaded into d16
25 DCW 2217, 2217, 2217, 2217 ; 16-bit multipliers, loaded into d17
26 DCD 14500, 14500, 14500, 14500 ; pass-1 rounding terms, paired with >>12
27 DCD 7500, 7500, 7500, 7500 ; pass-1 rounding terms, paired with >>12
28 DCD 12000, 12000, 12000, 12000 ; pass-2 rounding terms, paired with >>16
29 DCD 51000, 51000, 51000, 51000 ; pass-2 rounding terms, paired with >>16
30
31 ;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
; NEON implementation of the C prototype above.
; In:  r0 = input  (16-bit samples; rows 'pitch' bytes apart, 8-byte aligned)
;      r1 = output (16 contiguous 16-bit coefficients, 16-byte aligned)
;      r2 = pitch  (input row stride in bytes)
; Clobbers: r12, q0-q3, q8-q13.  Leaf function; no stack use.
32 |vp8_short_fdct4x4_neon| PROC
33
34 ; Part one: 1-D DCT across the four input rows (loads interleaved
; with constant-table loads to hide latency)
35 vld1.16 {d0}, [r0@64], r2 ; d0 = input row 0
36 adr r12, coeff ; r12 -> constant table
37 vld1.16 {d1}, [r0@64], r2 ; d1 = input row 1
38 vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
39 vld1.16 {d2}, [r0@64], r2 ; d2 = input row 2
40 vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
41 vld1.16 {d3}, [r0@64], r2 ; d3 = input row 3
42
43 ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
44 vtrn.32 d0, d2
45 vtrn.32 d1, d3
46 vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 (used in part two)
47 vtrn.16 d0, d1
48 vtrn.16 d2, d3
49
50 vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
51 vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
52 vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
53 vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
54
55 vshl.s16 q2, q2, #3 ; (a1, b1) << 3
56 vshl.s16 q3, q3, #3 ; (c1, d1) << 3
57
58 vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
59 vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
60
61 vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
62 vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
63 vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
64 vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
65
66 vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
67 vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
68
69
70 ; Part two: 1-D DCT down the columns of the part-one result
71
72 ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
73 vtrn.32 d0, d2
74 vtrn.32 d1, d3
75 vtrn.16 d0, d1
76 vtrn.16 d2, d3
77
78 vmov.s16 d26, #7 ; rounding constant for the DC terms
79
80 vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
81 vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
82 vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
83 vadd.s16 d4, d4, d26 ; a1 + 7
84 vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
85
86 vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
87 vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
88
89 vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
90 vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
91
92 vceq.s16 d4, d7, #0 ; per lane: d4 = (d1 == 0) ? 0xffff : 0
93
94 vshr.s16 d0, d0, #4 ; op[0] = (a1 + b1 + 7)>>4
95 vshr.s16 d2, d2, #4 ; op[8] = (a1 - b1 + 7)>>4
96
97 vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
98 vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
99
100 vmvn d4, d4 ; per lane: d4 = (d1 != 0) ? 0xffff (-1) : 0
101 vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
102 vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)  (subtracting -1 adds 1)
103 vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
104
105 vst1.16 {q0, q1}, [r1@128] ; store all 16 output coefficients
106
107 bx lr
108
109 ENDP
110
111 ;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
; NEON implementation of the C prototype above: two 4x4 FDCTs on an 8x4
; block (input columns 0-3 = block A, columns 4-7 = block B) in one pass.
; In:  r0 = input  (16-bit samples; rows 'pitch' bytes apart, 16-byte aligned)
;      r1 = output (2 x 16 contiguous 16-bit coefficients, 16-byte aligned;
;                   block A is written first, then block B; r1 is advanced)
;      r2 = pitch  (input row stride in bytes)
; Clobbers: r12, q0-q3, q8-q15.  Leaf function; no stack use.
112 |vp8_short_fdct8x4_neon| PROC
113
114 ; Part one: 1-D DCT across the four 8-wide input rows
115
116 vld1.16 {q0}, [r0@128], r2 ; q0 = input row 0 (A half | B half)
117 adr r12, coeff ; r12 -> constant table
118 vld1.16 {q1}, [r0@128], r2 ; q1 = input row 1
119 vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
120 vld1.16 {q2}, [r0@128], r2 ; q2 = input row 2
121 vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
122 vld1.16 {q3}, [r0@128], r2 ; q3 = input row 3
123
124 ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
; (comments give each register's contents after all four vtrn steps)
125 vtrn.32 q0, q2 ; [A0|B0]
126 vtrn.32 q1, q3 ; [A1|B1]
127 vtrn.16 q0, q1 ; [A2|B2]
128 vtrn.16 q2, q3 ; [A3|B3]
129
130 vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
131 vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
132 vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
133 vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
134
135 vshl.s16 q11, q11, #3 ; a1 << 3
136 vshl.s16 q12, q12, #3 ; b1 << 3
137 vshl.s16 q13, q13, #3 ; c1 << 3
138 vshl.s16 q14, q14, #3 ; d1 << 3
139
140 vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
141 vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
142
143 vmov.s16 q11, q9 ; 14500 (second accumulator, for the B half)
144 vmov.s16 q12, q10 ; 7500  (second accumulator, for the B half)
145
146 vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
147 vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
148 vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
149 vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
150
151 vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
152 vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
153 vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
154 vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
155
156 vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
157 vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
158 vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
159 vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
160
161
162 ; Part two: 1-D DCT down the columns of the part-one result
163 vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
164
165 ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
; (comments give each register's contents after all four vtrn steps)
166 vtrn.32 q0, q2 ; q0=[A0 | B0]
167 vtrn.32 q1, q3 ; q1=[A4 | B4]
168 vtrn.16 q0, q1 ; q2=[A8 | B8]
169 vtrn.16 q2, q3 ; q3=[A12|B12]
170
171 vmov.s16 q15, #7 ; rounding constant for the DC terms
172
173 vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
174 vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
175 vadd.s16 q11, q11, q15 ; a1 + 7
176 vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
177 vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
178
179 vadd.s16 q0, q11, q12 ; a1 + b1 + 7
180 vsub.s16 q1, q11, q12 ; a1 - b1 + 7
181
182 vmov.s16 q11, q9 ; 12000 (second accumulator, for the B half)
183 vmov.s16 q12, q10 ; 51000 (second accumulator, for the B half)
184
185 vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
186 vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
187 vshr.s16 d2, d2, #4 ; A[8] = (a1 - b1 + 7)>>4
188 vshr.s16 d6, d3, #4 ; B[8] = (a1 - b1 + 7)>>4
189
190
191 vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
192 vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
193 vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
194 vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
195
196 vceq.s16 q14, q14, #0 ; per lane: q14 = (d1 == 0) ? 0xffff : 0
197
198 vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
199 vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
200 vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
201 vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
202
203 vmvn q14, q14 ; per lane: q14 = (d1 != 0) ? 0xffff (-1) : 0
204
205 vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
206 vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
207 vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)  (subtracting -1 adds 1)
208
209 vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
210 vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
211 vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)  (subtracting -1 adds 1)
212
213 vst1.16 {q0, q1}, [r1@128]! ; block A
214 vst1.16 {q2, q3}, [r1@128]! ; block B
215
216 bx lr
217
218 ENDP
219
220 END
221
OLDNEW
« no previous file with comments | « source/libvpx/vp8/encoder/arm/neon/picklpf_arm.c ('k') | source/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698