Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(172)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
11 @// to support float instead of SC32.
12 @//
13
14 @//
15 @// Description:
16 @// Compute a Radix 4 FFT stage for a N point complex signal
17 @//
18 @//
19
20
21 @// Include standard headers
22
23 #include "dl/api/armCOMM_s.h"
24 #include "dl/api/omxtypes_s.h"
25
26 @// Import symbols required from other files
27 @// (For example tables)
28
29
30
31
32 @// Set debugging level
33 @//DEBUG_ON SETL {TRUE}
34
35
36 @// Guarding implementation by the processor name
37
38
39 @// Import symbols required from other files
40 @// (For example tables)
41 @//IMPORT armAAC_constTable
42
43 @//Input Registers
44 @// r0-r2 hold the three pointers; r6 and r7 hold subFFTNum and subFFTSize.
45 #define pSrc r0
46 #define pDst r2
47 #define pTwiddle r1
48 #define subFFTNum r6
49 #define subFFTSize r7
50
51
52
53 @//Output Registers
54
55
56 @//Local Scratch Registers
57 @// NOTE: grpCount and pTmp both expand to r4, so they must never be live at the same time.
58 #define outPointStep r3
59 #define grpCount r4
60 #define dstStep r5
61 #define grpTwStep r8
62 #define stepTwiddle r9
63 #define twStep r10
64 #define pTmp r4
65 #define step16 r11
66 #define step24 r12
67
68
69 @// Neon Registers
70 @// dButterfly* and dX* below are two sets of names for the same registers D0-D7.
71 #define dButterfly1Real02 D0.F32
72 #define dButterfly1Imag02 D1.F32
73 #define dButterfly1Real13 D2.F32
74 #define dButterfly1Imag13 D3.F32
75 #define dButterfly2Real02 D4.F32
76 #define dButterfly2Imag02 D5.F32
77 #define dButterfly2Real13 D6.F32
78 #define dButterfly2Imag13 D7.F32
79 #define dXr0 D0.F32
80 #define dXi0 D1.F32
81 #define dXr1 D2.F32
82 #define dXi1 D3.F32
83 #define dXr2 D4.F32
84 #define dXi2 D5.F32
85 #define dXr3 D6.F32
86 #define dXi3 D7.F32
87 @// dY*: D16-D23, the same storage as the Q views qY0-qY3 (Q8-Q11).
88 #define dYr0 D16.F32
89 #define dYi0 D17.F32
90 #define dYr1 D18.F32
91 #define dYi1 D19.F32
92 #define dYr2 D20.F32
93 #define dYi2 D21.F32
94 #define dYr3 D22.F32
95 #define dYi3 D23.F32
96 @// Twiddle factors live in D8-D13. NOTE(review): the qT0-qT5 names that follow expand to D registers, not Q registers.
97 #define dW1r D8.F32
98 #define dW1i D9.F32
99 #define dW2r D10.F32
100 #define dW2i D11.F32
101 #define dW3r D12.F32
102 #define dW3i D13.F32
103 #define qT0 d14.f32
104 #define qT1 d16.F32
105 #define qT2 d18.F32
106 #define qT3 d20.f32
107 #define qT4 d22.f32
108 #define qT5 d24.f32
109 @// dZ*: D14-D15 and D26-D31, the same storage as the Q views qZ0 (Q7) and qZ1-qZ3 (Q13-Q15).
110 #define dZr0 D14.F32
111 #define dZi0 D15.F32
112 #define dZr1 D26.F32
113 #define dZi1 D27.F32
114 #define dZr2 D28.F32
115 #define dZi2 D29.F32
116 #define dZr3 D30.F32
117 #define dZi3 D31.F32
118 @// Q-register views (Qn overlays D(2n):D(2n+1)): qX0 = D0:D1, i.e. dXr0/dXi0.
119 #define qX0 Q0.F32
120 #define qY0 Q8.F32
121 #define qY1 Q9.F32
122 #define qY2 Q10.F32
123 #define qY3 Q11.F32
124 #define qZ0 Q7.F32
125 #define qZ1 Q13.F32
126 #define qZ2 Q14.F32
127 #define qZ3 Q15.F32
128
129
130
131 .MACRO FFTSTAGE scaled, inverse , name
132 @// Emits one radix-4 stage. \inverse == "TRUE" selects the conjugate-twiddle multiply; \name suffixes the local labels.
133 @// Define stack arguments
134 @// Reads pSrc, pTwiddle, pDst and subFFTSize. On exit subFFTNum = 1, subFFTSize is 4x its entry value, and pSrc/pDst are swapped.
135 @// \scaled is accepted but is not referenced anywhere in this macro body.
136 @// pOut0+1 increments pOut0 by 8 bytes
137 @// pOut0+outPointStep == increment of 8*outPointStep bytes
138 MOV outPointStep,subFFTSize,LSL #3
139
140 @// Update grpCount and grpSize right away
141
142 VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr]
143 MOV step16,#16
144 LSL grpCount,subFFTSize,#2
145
146 VLD1 dW2r,[pTwiddle :64] @// [wi|wr]
147 MOV subFFTNum,#1 @//after the last stage
148
149 VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr]
150 MOV stepTwiddle,#0
151
152 VLD1 dW2i,[pTwiddle :64]! @// [wi|wr]
153 SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
154
155 @// update subFFTSize for the next stage
156 MOV subFFTSize,grpCount
157 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
158 MOV dstStep,outPointStep,LSL #1
159
160 @// AC.r AC.i BD.r BD.i
161 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
162 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
163 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
164 MOV step24,#24
165
166 @// AC.r AC.i BD.r BD.i
167 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
168
169
170 @// Process two groups at a time
171
172 radix4lsGrpLoop\name :
173
174 VZIP dW2r,dW2i
175 ADD stepTwiddle,stepTwiddle,#16
176 VZIP dW3r,dW3i
177 ADD grpTwStep,stepTwiddle,#4
178 VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
179 SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle
180 VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
181 MOV grpTwStep,grpTwStep,LSL #1
182 VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
183 RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle
184
185
186 VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
187 @// The flags set by the SUBS below are consumed by the addeq/beq further down and by the BGT that closes the loop;
188 @// no instruction in between updates the flags.
189 @// grpCount is multiplied by 4
190 SUBS grpCount,grpCount,#8
191 @// Twiddle multiply: the inverse branch forms Z = X * conj(W), the forward branch forms Z = X * W.
192 .ifeqs "\inverse", "TRUE"
193 VMUL dZr1,dW1r,dXr1
194 VMLA dZr1,dW1i,dXi1 @// real part
195 VMUL dZi1,dW1r,dXi1
196 VMLS dZi1,dW1i,dXr1 @// imag part
197
198 .else
199
200 VMUL dZr1,dW1r,dXr1
201 VMLS dZr1,dW1i,dXi1 @// real part
202 VMUL dZi1,dW1r,dXi1
203 VMLA dZi1,dW1i,dXr1 @// imag part
204
205 .endif
206
207 VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]
208
209 .ifeqs "\inverse", "TRUE"
210 VMUL dZr2,dW2r,dXr2
211 VMLA dZr2,dW2i,dXi2 @// real part
212 VMUL dZi2,dW2r,dXi2
213 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
214 VMLS dZi2,dW2i,dXr2 @// imag part
215
216 .else
217
218 VMUL dZr2,dW2r,dXr2
219 VMLS dZr2,dW2i,dXi2 @// real part
220 VMUL dZi2,dW2r,dXi2
221 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr]
222 VMLA dZi2,dW2i,dXr2 @// imag part
223
224 .endif
225
226
227 VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr]
228
229 @// move qX0 so as to load for the next iteration
230 VMOV qZ0,qX0
231
232 .ifeqs "\inverse", "TRUE"
233 VMUL dZr3,dW3r,dXr3
234 VMLA dZr3,dW3i,dXi3 @// real part
235 VMUL dZi3,dW3r,dXi3
236 VLD1 dW3r,[pTwiddle :64],step24
237 VMLS dZi3,dW3i,dXr3 @// imag part
238
239 .else
240
241 VMUL dZr3,dW3r,dXr3
242 VMLS dZr3,dW3i,dXi3 @// real part
243 VMUL dZi3,dW3r,dXi3
244 VLD1 dW3r,[pTwiddle :64],step24
245 VMLA dZi3,dW3i,dXr3 @// imag part
246
247 .endif
248
249 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr]
250
251 @// Don't do the load on the last iteration so we don't read past the end
252 @// of pSrc.
253 addeq pSrc, pSrc, #64
254 beq radix4lsSkipRead\name
255 @// AC.r AC.i BD.r BD.i
256 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
257
258 @// AC.r AC.i BD.r BD.i
259 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
260 radix4lsSkipRead\name:
261
262 @// finish first stage of 4 point FFT
263
264 VADD qY0,qZ0,qZ2
265 VSUB qY2,qZ0,qZ2
266 VADD qY1,qZ1,qZ3
267 VSUB qY3,qZ1,qZ3
268
269 @// Below: three stores advance pDst by outPointStep each; the fourth advances by dstStep, so pDst nets +16 bytes per pass.
270 @// finish second stage of 4 point FFT
271
272 .ifeqs "\inverse", "TRUE"
273
274 VSUB qZ0,qY2,qY1
275
276 VADD dZr3,dYr0,dYi3
277 VST2 {dZr0,dZi0},[pDst :128],outPointStep
278 VSUB dZi3,dYi0,dYr3
279
280 VADD qZ2,qY2,qY1
281 VST2 {dZr3,dZi3},[pDst :128],outPointStep
282
283 VSUB dZr1,dYr0,dYi3
284 VST2 {dZr2,dZi2},[pDst :128],outPointStep
285 VADD dZi1,dYi0,dYr3
286
287 @// dstStep = -3*outPointStep + 16
288 VST2 {dZr1,dZi1},[pDst :128],dstStep
289
290
291 .else
292
293 VSUB qZ0,qY2,qY1
294
295 VSUB dZr1,dYr0,dYi3
296 VST2 {dZr0,dZi0},[pDst :128],outPointStep
297 VADD dZi1,dYi0,dYr3
298
299 VADD qZ2,qY2,qY1
300 VST2 {dZr1,dZi1},[pDst :128],outPointStep
301
302 VADD dZr3,dYr0,dYi3
303 VST2 {dZr2,dZi2},[pDst :128],outPointStep
304 VSUB dZi3,dYi0,dYr3
305
306 @// dstStep = -3*outPointStep + 16
307 VST2 {dZr3,dZi3},[pDst :128],dstStep
308
309
310 .endif
311
312 BGT radix4lsGrpLoop\name
313
314 @// pTmp expands to r4, the same register as grpCount; grpCount is no longer needed once the loop has exited.
315 @// Reset and Swap pSrc and pDst for the next stage
316 MOV pTmp,pDst
317 @// Extra increment done in final iteration of the loop
318 SUB pSrc,pSrc,#64
319 @// pDst -= 4*size; pSrc -= 8*size bytes
320 SUB pDst,pSrc,outPointStep,LSL #2
321 SUB pSrc,pTmp,outPointStep
322 SUB pTwiddle,pTwiddle,subFFTSize,LSL #1
323 @// Extra increment done in final iteration of the loop
324 SUB pTwiddle,pTwiddle,#16
325
326 .endm
327 @// Forward entry point: expands FFTSTAGE with inverse = "FALSE" and label suffix "fwd".
328 @// NOTE(review): only r4 is passed to M_START; M_START/M_END are not defined in this file - confirm what they save/restore.
329 M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
330 FFTSTAGE "FALSE","FALSE",fwd
331 M_END
332 @// Inverse entry point: expands FFTSTAGE with inverse = "TRUE" (conjugate-twiddle multiply) and label suffix "inv".
333 @// NOTE(review): only r4 is passed to M_START; M_START/M_END are not defined in this file - confirm what they save/restore.
334 M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
335 FFTSTAGE "FALSE","TRUE",inv
336 M_END
337
338
339 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698