Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(320)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
11 @// to support float instead of SC32.
12 @//
13
14 @//
15 @// Description:
16 @// Compute FFT for a real signal
17 @//
18 @//
19
20
21 @// Include standard headers
22
23 #include "dl/api/armCOMM_s.h"
24 #include "dl/api/omxtypes_s.h"
25
26
27 @// Import symbols required from other files
28 @// (For example tables)
29
30 .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
31 .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
32 .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
33 .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
34 .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
35 .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
36 .extern armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
37
38 @// Set debugging level
39 @//DEBUG_ON SETL {TRUE}
40
41
42
43 @// Guarding implementation by the processor name
44
45
46
47 @// Guarding implementation by the processor name
48
49 @// Import symbols required from other files
50 @// (For example tables)
51 .extern armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
52 .extern armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
53
54
55 @// Input registers
56
57 #define pSrc r0
58 #define pDst r1
59 #define pFFTSpec r2
60 #define scale r3
61
62
63 @// Output registers
64 #define result r0
65
66 @// Local scratch registers. Many names below alias the same core register
67 @// (r1 = pDst/argTwiddle, r2 = pFFTSpec/argDst/diffMinusOne, r3 = scale/step, r4 = argScale/tmpOrder/pTwiddle/x0r/step1, r5 = pOut/x0i/pTwiddleTmp, r6 = subFFTNum/N/subFFTSizeTmp, r8 = count/twStep, r9 = diff/zero), so a name is only meaningful while its register holds that value.
68 #define argTwiddle r1
69 #define argDst r2
70 #define argScale r4
71 #define tmpOrder r4
72 #define pTwiddle r4
73 #define pOut r5
74 #define subFFTSize r7
75 #define subFFTNum r6
76 #define N r6
77 #define order r14
78 #define diff r9
79 @// Total number of radix stages required to complete the FFT
80 #define count r8
81 #define x0r r4
82 #define x0i r5
83 #define diffMinusOne r2
84 #define subFFTSizeTmp r6
85 #define step r3
86 #define step1 r4
87 #define twStep r8
88 #define zero r9
89 #define pTwiddleTmp r5
90 #define t0 r10
91
92 @// Neon registers. Each name expands to a D register with an .f32 type suffix (.s64 for the *S64 names).
93 @// Aliases share storage: half/dX0 = d0, dZero/dX0r = d2, dShift/dX0i/dX1 = d3, dX1r/dW0 = d4, dX1i/dW1 = d5, qT0/dY0 = d10, qT1/dY2 = d12, dW0r/dY0r = d14, dW0i/dY0i = d15, dW1r/dY1r = d16, dW1i/dY1i = d17. The qT* names are D registers (d10, d12, d18, d20), not Q registers.
94 #define dX0 d0.f32
95 #define dzero d1.f32
96 #define dZero d2.f32
97 #define dShift d3.f32
98 #define dX0r d2.f32
99 #define dX0i d3.f32
100 #define dX1r d4.f32
101 #define dX1i d5.f32
102 #define dT0 d6.f32
103 #define dT1 d7.f32
104 #define dT2 d8.f32
105 #define dT3 d9.f32
106 #define qT0 d10.f32
107 #define qT1 d12.f32
108 #define dW0r d14.f32
109 #define dW0i d15.f32
110 #define dW1r d16.f32
111 #define dW1i d17.f32
112 #define dY0r d14.f32
113 #define dY0i d15.f32
114 #define dY1r d16.f32
115 #define dY1i d17.f32
116 #define dY0rS64 d14.s64
117 #define dY0iS64 d15.s64
118 #define qT2 d18.f32
119 #define qT3 d20.f32
120 @// Last three elements (none of the names in this group is referenced by the code in this file)
121 #define dX1 d3.f32
122 #define dW0 d4.f32
123 #define dW1 d5.f32
124 #define dY0 d10.f32
125 #define dY1 d11.f32
126 #define dY2 d12.f32
127 #define dY3 d13.f32
128
129 #define half d0.f32
130
131 HALF: .float 0.5 @// single-precision 0.5; its address is taken with LDR t0, =HALF just before evenOddButterflyLoop
132
133 @// Allocate stack memory required by the function
134 @// Entry: pSrc (r0), pDst (r1), pFFTSpec (r2). `scale` (r3) is never referenced by name in this routine; r3 is later reused as `step`. Exit: result (r0) = OMX_Sts_NoErr.
135 @// Write function header
136 M_START omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15
137 @// NOTE(review): M_START/M_END are not defined in this file; presumably the r11,d15 arguments name the highest core/NEON registers to preserve - confirm in the included headers.
138 @ Structure offsets for the FFTSpec
139 .set ARMsFFTSpec_N, 0
140 .set ARMsFFTSpec_pBitRev, 4 @// defined but not read by this routine
141 .set ARMsFFTSpec_pTwiddle, 8
142 .set ARMsFFTSpec_pBuf, 12
143
144 @// Define stack arguments
145
146 @// Read the FFT size N from the spec (its log2 is taken later, at sizeGreaterThanOne)
147 LDR N, [pFFTSpec, #ARMsFFTSpec_N]
148
149 @// Read other structure parameters
150 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
151 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
152
153 @// N <= 1 is treated separately: the output is the three floats (x[0], 0, 0)
154 CMP N,#1
155 BGT sizeGreaterThanOne
156 VLD1 dX0[0],[pSrc] @// lane 0 = x[0]
157 MOV zero,#0
158 VMOV dzero[0],zero @// all-zero bits, i.e. 0.0f
159 VMOV dZero[0],zero
160 VST3 {dX0[0],dzero[0],dZero[0]},[pDst] @// store x[0], 0, 0
161
162 B End
163
164
165
166 sizeGreaterThanOne:
167 @// Do a N/2 point complex FFT including the scaling
168
169 MOV N,N,ASR #1 @// N/2 point complex FFT
170
171 CLZ order,N @// N = 2^order
172 RSB order,order,#31 @// order = 31 - CLZ(N), the bit index of the highest set bit of the halved size
173 MOV subFFTSize,#1
174 @//MOV subFFTNum,N (would be a no-op: subFFTNum and N are both r6)
175
176 CMP order,#3
177 BGT orderGreaterthan3 @// order > 3
178
179 CMP order,#1
180 BGE orderGreaterthan0 @// order > 0
181 VLD1 dX0,[pSrc] @// order == 0: copy the single complex input to pOut
182 VST1 dX0,[pOut]
183 MOV pSrc,pOut
184 MOV argDst,pDst
185 BLT FFTEnd @// flags are still those of CMP order,#1 (nothing in between sets flags), so this is always taken here
186
187 orderGreaterthan0:
188 @// set the buffers appropriately for various orders
189 CMP order,#2
190 MOVEQ argDst,pDst @// order == 2: argDst = pDst
191 MOVNE argDst,pOut @// otherwise: argDst = pOut ...
192 @// Pass the first stage destination in RN5 (r5, i.e. pOut)
193 MOVNE pOut,pDst @// ... and pOut = pDst
194 MOV argTwiddle,pTwiddle @// argTwiddle is r1, so this overwrites pDst; pDst was copied out above
195
196 CMP order,#1
197 BGT orderGreaterthan1
198 @// order = 1
199 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
200 B FFTEnd
201
202 orderGreaterthan1:
203 CMP order,#2
204 BGT orderGreaterthan2
205 @// order =2
206 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
207 BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
208 B FFTEnd
209
210 orderGreaterthan2:@// order =3
211 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
212 BL armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
213 BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
214
215 B FFTEnd
216
217
218
219 orderGreaterthan3:
220 specialScaleCase:
221
222 @// Set input args to fft stages; the buffer choice depends on bit 1 of order
223 TST order, #2
224 MOVEQ argDst,pDst @// bit 1 clear: argDst = pDst
225 MOVNE argDst,pOut @// bit 1 set: argDst = pOut ...
226 @// Pass the first stage destination in RN5 (r5, i.e. pOut)
227 MOVNE pOut,pDst @// ... and pOut = pDst
228 MOV argTwiddle,pTwiddle @// r1 now holds the twiddle pointer instead of pDst
229
230 @//check for even or odd order
231 @// NOTE: The following combination of BL's would work fine even though
232 @// the first BL would corrupt the flags. This is because the end of
233 @// the "grpZeroSetLoop" loop inside
234 @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
235 @// to EQ
236
237 TST order,#0x00000001
238 BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
239 BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
240
241 CMP subFFTNum,#4 @// NOTE(review): subFFTNum (r6) is presumably updated by the stage routines; not visible in this file
242 BLT FFTEnd
243
244
245 unscaledRadix4Loop:
246 BEQ lastStageUnscaledRadix4 @// tests the flags of the most recent CMP subFFTNum,#4 (before the loop or at its bottom)
247 BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
248 CMP subFFTNum,#4
249 B unscaledRadix4Loop
250
251 lastStageUnscaledRadix4:
252 BL armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
253 B FFTEnd
254
255
256 FFTEnd:
257 finalComplexToRealFixup:
258
259 @// With Z(0) = a + jb (Z' denotes the complex conjugate):
260 @// F(0) = 1/2{ [Z(0) + Z'(0)] - j [Z(0) - Z'(0)] }
261 @// = 1/2{ [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] }
262 @// = 1/2{ [2a+j0] - j [0+j2b] }
263 @// = (a+b, 0)
264
265 @// F(N/2) = 1/2{ [Z(0) + Z'(0)] + j [Z(0) - Z'(0)] }
266 @// = 1/2{ [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] }
267 @// = 1/2{ [2a+j0] + j [0+j2b] }
268 @// = (a-b, 0)
269
270 @// F(0) and F(N/2)
271 VLD2 {dX0r[0],dX0i[0]},[pSrc]! @// dX0r[0] = a, dX0i[0] = b; pSrc advances past Z(0)
272 MOV zero,#0
273 VMOV dX0r[1],zero @// upper lanes cleared so the stored imaginary parts are 0
274 MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes
275 VMOV dX0i[1],zero
276 @// twStep = 3N/8 * 8 bytes pointing to W^1
277 SUB twStep,step,subFFTSize,LSL #1
278
279 VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
280 MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes
281 VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
282 SUBS subFFTSize,subFFTSize,#2 @// these flags are consumed by BLT End / BEQ lastElement below
283
284 VST1 dY0r,[argDst],step @// store F(0), then advance argDst by step
285 ADD pTwiddleTmp,argTwiddle,#8 @// W^2
286 VST1 dY0i,[argDst]! @// store F(N/2), advance 8 bytes
287 ADD argTwiddle,argTwiddle,twStep @// W^1
288
289 VDUP dzero,zero
290 SUB argDst,argDst,step @// argDst = original destination + 8 bytes (output element 1)
291
292 BLT End @// flags from the SUBS above (nothing in between sets flags): N/2 < 2, nothing left to do
293 BEQ lastElement @// N/2 == 2: only the middle element remains
294 SUB step,step,#24
295 SUB step1,step1,#8 @// (N/4-1)*8 bytes
296
297 @// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
298 @// Note: W^k is stored as negative values in the table
299 @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
300 @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
301
302
303 LDR t0, =HALF @// t0 = address of the 0.5 constant
304 VLD1 half[0], [t0] @// half[0] = 0.5 (half is d0, the same register as dX0)
305
306 evenOddButterflyLoop:
307
308
309 VLD1 dW0r,[argTwiddle],step1
310 VLD1 dW1r,[argTwiddle]!
311
312 VLD2 {dX0r,dX0i},[pSrc],step @// de-interleave two complex inputs from the front; then jump by step
313 SUB argTwiddle,argTwiddle,step1 @// net effect of the two loads plus this SUB: argTwiddle += 8
314 VLD2 {dX1r,dX1i},[pSrc]! @// de-interleave two complex inputs from the far end
315
316
317
318 SUB step1,step1,#8 @// (N/4-2)*8 bytes
319 VLD1 dW0i,[pTwiddleTmp],step1
320 VLD1 dW1i,[pTwiddleTmp]!
321 SUB pSrc,pSrc,step @// net effect of the two VLD2s plus this SUB: pSrc += 16
322
323 SUB pTwiddleTmp,pTwiddleTmp,step1 @// net effect: pTwiddleTmp += 8
324 VREV64 dX1r,dX1r @// swap the two lanes of the far-end data
325 VREV64 dX1i,dX1i
326 SUBS subFFTSize,subFFTSize,#4 @// these flags are consumed by BGT at the bottom of the loop
327
328
329
330 VSUB dT2,dX0r,dX1r @// a-c
331 SUB step1,step1,#8
332 VADD dT0,dX0r,dX1r @// a+c
333 VSUB dT1,dX0i,dX1i @// b-d
334 VADD dT3,dX0i,dX1i @// b+d
335 VMUL dT0,dT0,half[0] @// (a+c)/2
336 VMUL dT1,dT1,half[0] @// (b-d)/2
337 VZIP dW1r,dW1i @// interleave the lanes of the two twiddle registers
338 VZIP dW0r,dW0i
339
340
341 VMUL qT0,dW1r,dT2
342 VMUL qT1,dW1r,dT3
343 VMUL qT2,dW0r,dT2
344 VMUL qT3,dW0r,dT3
345
346 VMLA qT0,dW1i,dT3
347 VMLS qT1,dW1i,dT2
348
349 VMLS qT2,dW0i,dT3
350 VMLA qT3,dW0i,dT2
351
352
353 VMUL dX1r,qT0,half[0] @// halve the twiddle products
354 VMUL dX1i,qT1,half[0]
355
356 VSUB dY1r,dT0,dX1i @// F(N/2 -1)
357 VADD dY1i,dT1,dX1r
358 VNEG dY1i,dY1i
359
360 VREV64 dY1r,dY1r @// swap the lanes back before storing the far-end pair
361 VREV64 dY1i,dY1i
362
363
364 VMUL dX0r,qT2,half[0]
365 VMUL dX0i,qT3,half[0]
366
367 VSUB dY0r,dT0,dX0i @// F(1)
368 VADD dY0i,dT1,dX0r
369
370
371 VST2 {dY0r,dY0i},[argDst],step @// interleave and store the front pair; then jump by step
372 VST2 {dY1r,dY1i},[argDst]! @// interleave and store the far-end pair
373 SUB argDst,argDst,step @// net effect of the two VST2s plus this SUB: argDst += 16
374 SUB step,step,#32 @// (N/2-4)*8 bytes
375
376
377 BGT evenOddButterflyLoop @// flags from SUBS subFFTSize,subFFTSize,#4 above (nothing in between sets flags)
378
379 @// set both the ptrs to the last element
380 SUB pSrc,pSrc,#8
381 SUB argDst,argDst,#8
382
383
384
385 @// Last element can be expanded as follows
386 @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
387 @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
388 @// 1/2[2a+j0] + j (c+jd) [0+j2b]
389 @// (a-bc, -bd)
390 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
391
392 lastElement:
393 VLD1 dX0r,[pSrc] @// lanes = (a, b)
394
395 VST1 dX0r[0],[argDst]! @// store a
396 VNEG dX0r,dX0r
397 VST1 dX0r[1],[argDst]! @// store -b
398
399 End:
400 @// Set return value (every path through this routine returns OMX_Sts_NoErr)
401 MOV result, #OMX_Sts_NoErr
402
403 @// Write function tail
404 M_END
405
406 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698