Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(209)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This file was originally licensed as follows. It has been
11 @// relicensed with permission from the copyright holders.
12 @//
13
14 @//
15 @// File Name: omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision: 7810
18 @// Last Modified Date: Thu, 04 Oct 2007
19 @//
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @//
22 @//
23 @//
24 @// Description:
25 @// Compute FFT for a real signal
26 @//
27
28
29
30 @// Include standard headers
31
32 #include "dl/api/armCOMM_s.h"
33 #include "dl/api/omxtypes_s.h"
34
35
36 @// Import symbols required from other files
37 @// (For example tables)
38
39 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
40 .extern armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
41 .extern armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
42 .extern armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
43 .extern armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
44 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
45 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
46 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
47 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
48 .extern armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
49
50 @// Set debugging level
51 @//DEBUG_ON SETL {TRUE}
52
53
54
55 @// Guarding implementation by the processor name
56
57
58
59 @// Guarding implementation by the processor name
60
61 @// Import symbols required from other files
62 @// (For example tables)
63 .extern armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
64 .extern armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
65 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
66 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
67
68
69 @//Input Registers
@// The four arguments of the routine, in r0-r3.
70
71 #define pSrc r0
72 #define pDst r1
73 #define pFFTSpec r2
74 #define scale r3
75
76
77 @// Output registers
78 #define result r0
79
80 @//Local Scratch Registers
@// NOTE: many of the names below are aliases of the same core register:
@//   r1: pDst, argTwiddle
@//   r2: pFFTSpec, argDst, diffMinusOne
@//   r3: scale, step
@//   r4: argScale, tmpOrder, pTwiddle, x0r, step1
@//   r5: pOut, x0i, pTwiddleTmp
@//   r6: subFFTNum, N, subFFTSizeTmp
@//   r8: count, twStep
@//   r9: diff, zero
@// Writing one alias therefore overwrites every other alias of that register.
@// order is r14 (the link register), so it is overwritten by every BL.
81
82 #define argTwiddle r1
83 #define argDst r2
84 #define argScale r4
85 #define tmpOrder r4
86 #define pTwiddle r4
87 #define pOut r5
88 #define subFFTSize r7
89 #define subFFTNum r6
90 #define N r6
91 #define order r14
92 #define diff r9
93 @// Total number of radix stages required to complete the FFT
94 #define count r8
95 #define x0r r4
96 #define x0i r5
97 #define diffMinusOne r2
98 #define subFFTSizeTmp r6
99 #define step r3
100 #define step1 r4
101 #define twStep r8
102 #define zero r9
103 #define pTwiddleTmp r5
104 #define t0 r10
105
106 @// Neon registers
@// As with the core registers, several names share one D register:
@//   d2: dZero, dX0r          d3: dShift, dX0i, dX1
@//   d4: dX1r, dW0            d5: dX1i, dW1
@//   d14-d17: dW0r, dW0i, dW1r, dW1i and dY0r, dY0i, dY1r, dY1i
@// qT0 (q5) and qT1 (q6) occupy d10-d13, the same storage as dY0-dY3.
@// The .s32 / .s64 suffix gives the element type used by the instruction.
107
108 #define dX0 d0.s32
109 #define dzero d1.s32
110 #define dZero d2.s32
111 #define dShift d3.s32
112 #define dX0r d2.s32
113 #define dX0i d3.s32
114 #define dX1r d4.s32
115 #define dX1i d5.s32
116 #define dT0 d6.s32
117 #define dT1 d7.s32
118 #define dT2 d8.s32
119 #define dT3 d9.s32
120 #define qT0 q5.s64
121 #define qT1 q6.s64
122 #define dW0r d14.s32
123 #define dW0i d15.s32
124 #define dW1r d16.s32
125 #define dW1i d17.s32
126 #define dY0r d14.s32
127 #define dY0i d15.s32
128 #define dY1r d16.s32
129 #define dY1i d17.s32
130 #define dY0rS64 d14.s64
131 #define dY0iS64 d15.s64
132 #define qT2 q9.s64
133 #define qT3 q10.s64
134 @// Last three elements
135 #define dX1 d3.s32
136 #define dW0 d4.s32
137 #define dW1 d5.s32
138 #define dY0 d10.s32
139 #define dY1 d11.s32
140 #define dY2 d12.s32
141 #define dY3 d13.s32
142
143 @// Allocate stack memory required by the function
@// diffOnStack is a scratch slot written with M_STR and read back with
@// M_LDR; it carries the "diff" value (a difference between scale and order)
@// across the calls to the FFT stage routines.
@// NOTE(review): M_ALLOC4 comes from armCOMM_s.h; presumably it reserves a
@// 4-byte-aligned slot of 4 bytes - confirm against the macro definition.
144
145 M_ALLOC4 diffOnStack, 4
146
147 @// Write function header
@// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec, r3 = scale.
@// Exit:  r0 = OMX_Sts_NoErr (the only value this routine returns).
@// NOTE(review): the trailing r11,d15 arguments presumably tell M_START which
@// callee-saved core/NEON registers to preserve - confirm in armCOMM_s.h.
148 M_START omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15
149
150 @ Structure offsets for the FFTSpec
151 .set ARMsFFTSpec_N, 0
152 .set ARMsFFTSpec_pBitRev, 4
153 .set ARMsFFTSpec_pTwiddle, 8
154 .set ARMsFFTSpec_pBuf, 12
155
156 @// Define stack arguments
157
158 @// Read the size from structure and take log
@// (the log itself is computed later, once N has been halved)
159 LDR N, [pFFTSpec, #ARMsFFTSpec_N]
160
161 @// Read other structure parameters
@// pTwiddle (r4) = pTwiddle field, pOut (r5) = pBuf field of the spec.
162 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
163 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
164
165 @// N = 1: treat separately (BGT is not taken for any N <= 1)
166 CMP N,#1
167 BGT sizeGreaterThanOne
@// Load the single 32-bit input sample into lane 0 of dX0.
168 VLD1 dX0[0],[pSrc]
169 RSB scale,scale,#0 @// to use VRSHL for right shift by a variable
170 MOV zero,#0
171 VMOV dShift[0],scale
172 VMOV dzero[0],zero
@// Shift by -scale, i.e. a rounding right shift by scale.
173 VRSHL dX0,dShift
174 VMOV dZero[0],zero
@// Store three 32-bit words to pDst: the shifted sample, then two zeros.
175 VST3 {dX0[0],dzero[0],dZero[0]},[pDst]
176
177 B End
178
179
180
181 sizeGreaterThanOne:
182 @// Do a N/2 point complex FFT including the scaling
@// This section dispatches on order = log2(N/2). Orders 0..3 are handled
@// here with radix-2 stages; larger orders branch to orderGreaterthan3.
183
184 MOV N,N,ASR #1 @// N/2 point complex FFT
185
@// order = 31 - CLZ(N), i.e. the index of the highest set bit of N.
186 CLZ order,N @// N = 2^order
187 RSB order,order,#31
188 MOV subFFTSize,#1
189 @//MOV subFFTNum,N
190
191 CMP order,#3
192 BGT orderGreaterthan3 @// order > 3
193
194 CMP order,#1
195 BGE orderGreaterthan0 @// order > 0
@// order = 0: no FFT stage is run. The whole scale is stored as diff so that
@// FFTEnd applies it, one 64-bit element is copied from pSrc to pOut, and
@// the fixup then reads from pOut and writes to pDst. None of the
@// instructions below sets flags, so LT from the CMP above is still valid
@// at the BLT.
196 M_STR scale, diffOnStack,LT @// order = 0
197 VLD1 dX0,[pSrc]
198 VST1 dX0,[pOut]
199 MOV pSrc,pOut
200 MOV argDst,pDst
201 BLT FFTEnd
202
203 orderGreaterthan0:
204 @// set the buffers appropriately for various orders
@// order == 2: argDst = pDst, pOut unchanged.
@// otherwise:  argDst = pOut and pOut = pDst (the two buffers are swapped).
@// pDst (r1) must be consumed here because argTwiddle, set next, is also r1.
205 CMP order,#2
206 MOVEQ argDst,pDst
207 MOVNE argDst,pOut
208 MOVNE pOut,pDst @// Pass the first stage destination in RN5
209 MOV argTwiddle,pTwiddle
210
@// diff = scale - order is saved for FFTEnd; scale is then clamped to order.
211 SUBS diff,scale,order
212 M_STR diff,diffOnStack
213 MOVGT scale,order
214 @// Now scale <= order
215
216 CMP order,#1
217 BGT orderGreaterthan1
@// order = 1: a single first stage. EQ (scale was 1) selects the scaled
@// routine, LT (scale was below 1) the unscaled one.
@// NOTE(review): the BLLT is evaluated with whatever flags the routine called
@// by BLEQ returns with - confirm that the callee never returns with LT set.
218 SUBS scale,scale,#1
219 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// order = 1
220 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe @// order = 1
221 B FFTEnd
222
223 orderGreaterthan1:
224 CMP order,#2
@// argScale is r4, the same register as pTwiddle; pTwiddle has already been
@// copied to argTwiddle above. MOV does not change the flags set by the CMP.
225 MOV argScale,scale
226 BGT orderGreaterthan2
@// order = 2: first stage then last stage; argScale is decremented before
@// each stage and selects the scaled (Sfs) or unscaled variant.
227 SUBS argScale,argScale,#1
228 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// order =2
229 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
230 SUBS argScale,argScale,#1
231 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
232 BLLT armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
233 B FFTEnd
234
235 orderGreaterthan2:@// order =3
@// order = 3: first, middle and last radix-2 stages, each chosen the same way.
236 SUBS argScale,argScale,#1
237 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
238 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
239 SUBS argScale,argScale,#1
240 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
241 BLLT armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
242 SUBS argScale,argScale,#1
243 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
244 BLLT armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
245 B FFTEnd
246
247
248
249 orderGreaterthan3:
250 @// check scale = 0 or scale = order
@// Three-way dispatch for order > 3:
@//   scale >= order -> specialScaleCase (scale clamped to order, diff >= 0)
@//   scale == 0     -> specialScaleCase (diff < 0)
@//   otherwise      -> generalScaleCase
251 SUBS diff, scale, order @// diff = scale - order
252 MOVGT scale,order
253 BGE specialScaleCase @// scale >= order (clamped to order above)
254 CMP scale,#0
255 BEQ specialScaleCase
256 B generalScaleCase
257
258 specialScaleCase:@// scale = 0 or scale = order and order >= 2
259
@// Bit 1 of order picks which buffer the first stage writes to. With the
@// radix-4 / radix-8 stage sequence used below this bit appears to equal the
@// parity of the number of stages - NOTE(review): confirm.
260 TST order, #2 @// Set input args to fft stages
261 MOVEQ argDst,pDst
262 MOVNE argDst,pOut
263 MOVNE pOut,pDst @// Pass the first stage destination in RN5
264 MOV argTwiddle,pTwiddle
265
@// diff is saved for FFTEnd; diff >= 0 selects the scaled stage routines.
266 CMP diff,#0
267 M_STR diff, diffOnStack
268 BGE scaleEqualsOrder
269
270 @//check for even or odd order
271 @// NOTE: The following combination of BL's would work fine even though the first
272 @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
273 @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
274
@// Even order: radix-4 first stage. Odd order: radix-8 first stage.
275 TST order,#0x00000001
276 BLEQ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
277 BLNE armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
278
@// NOTE(review): subFFTNum (r6) is not written in this file after the stage
@// call; presumably the stage routines update it - confirm in those files.
279 CMP subFFTNum,#4
280 BLT FFTEnd
281
282
@// Run radix-4 stages until subFFTNum == 4, then run the last-stage variant.
@// The BEQ at the loop head consumes the flags of the CMP that precedes it.
283 unscaledRadix4Loop:
284 BEQ lastStageUnscaledRadix4
285 BL armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
286 CMP subFFTNum,#4
287 B unscaledRadix4Loop
288
289 lastStageUnscaledRadix4:
290 BL armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
291 B FFTEnd
292
293
294 scaleEqualsOrder:
@// Same stage sequence as the unscaled path above, using the Sfs (scaled)
@// stage routines.
295 @//check for even or odd order
296 @// NOTE: The following combination of BL's would work fine even though the first
297 @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
298 @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
299
300 TST order,#0x00000001
301 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
302 BLNE armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
303
304 CMP subFFTNum,#4
305 BLT FFTEnd
306
307
308 scaledRadix4Loop:
309 BEQ lastStageScaledRadix4
310 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
311 CMP subFFTNum,#4
312 B scaledRadix4Loop
313
314 lastStageScaledRadix4:
315 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
316 B FFTEnd
317
318 generalScaleCase:@// 0 < scale < order and order >= 2
@// Runs "scale" scaled radix-2 stages first, then finishes with unscaled
@// stages: radix-4 when the remaining stage count (order - scale) is even,
@// radix-2 when it is odd. Both endings branch straight to
@// finalComplexToRealFixup, skipping the extra scaling pass at FFTEnd.
319 @// Determine the correct destination buffer
@// Note that diff here is order - scale (positive).
320 SUB diff,order,scale
321 TST diff,#0x01
322 ADDEQ count, scale,diff,lsr #1 @// count = scale + (order - scale)/2
323 MOVNE count, order
324 TST count, #0x01 @// Is count even or odd ?
325
326 MOVEQ argDst,pDst @// Set input args to fft stages
327 MOVNE argDst,pOut
328 MOVNE pOut,pDst @// Pass the first stage destination in RN5
329 MOV argTwiddle,pTwiddle
330
@// diff (r9) is saved on the stack and reloaded after the scaled stages.
331 M_STR diff, diffOnStack
332
@// argScale is r4 (same register as pTwiddle, already copied to argTwiddle).
333 MOV argScale,scale @// Put scale in RN4 so as to save and restore
334 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage
335 SUBS argScale,argScale,#1
336
@// One scaled radix-2 stage per remaining unit of argScale. The BLGT uses the
@// flags of the SUBS executed just before it (either the one above or the one
@// at the bottom of the loop).
337 scaledRadix2Loop:
338 BLGT armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
339 SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages
340 BGT scaledRadix2Loop
341
342
343 M_LDR diff, diffOnStack
344 @//check for even or odd order
@// (the value tested is diff = order - scale, the number of stages left)
345 TST diff,#0x00000001
346 BEQ generalUnscaledRadix4Loop
347 B unscaledRadix2Loop
348
@// Unscaled radix-4 stages until subFFTNum == 4, then the last-stage variant.
@// NOTE(review): subFFTNum (r6) is presumably updated by the stage routines.
349 generalUnscaledRadix4Loop:
350 CMP subFFTNum,#4
351 BEQ generalLastStageUnscaledRadix4
352 BL armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
353 B generalUnscaledRadix4Loop
354
355 generalLastStageUnscaledRadix4:
356 BL armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
357 B finalComplexToRealFixup
358
359
@// Unscaled radix-2 stages until subFFTNum == 2, then the last-stage variant.
360 unscaledRadix2Loop:
361 CMP subFFTNum,#2
362 BEQ generalLastStageUnscaledRadix2
363 BL armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
364 B unscaledRadix2Loop
365
366 generalLastStageUnscaledRadix2:
367 BL armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
368 B finalComplexToRealFixup
369
370
371 FFTEnd:@// Does only the scaling
@// Applies a rounding right shift by diff to subFFTSize 64-bit elements at
@// pSrc, in place. Skipped entirely when the stored diff is <= 0.
372
373 M_LDR diff, diffOnStack
374 CMP diff,#0
375 BLE finalComplexToRealFixup
376
377 RSB diff,diff,#0 @// to use VRSHL for right shift by a variable
378 VDUP dShift,diff
379
380 @// save subFFTSize and use subFFTSizeTmp in the following loop
381 MOV subFFTSizeTmp,subFFTSize @// subFFTSizeTmp same reg as subFFTNum
382
383 scaleFFTData:@// N = subFFTSize ; dataptr = pDst ; scale = diff
@// Load one 64-bit element, shift both 32-bit lanes, store it back and
@// advance pSrc by 8 bytes. The NEON instructions leave the SUBS flags
@// intact for the BGT.
384 VLD1 {dX0},[pSrc] @// pSrc contains pDst pointer
385 SUBS subFFTSizeTmp,subFFTSizeTmp,#1
386 VRSHL dX0,dShift
387 VST1 {dX0},[pSrc]!
388
389 BGT scaleFFTData
390
@// Rewind pSrc by subFFTSize * 8 bytes, back to the first element.
391 SUB pSrc,pSrc,subFFTSize,LSL #3 @// reset pSrc for final fixup
392
@// The four lines below are notes left by the original author.
393 @// change the logic so that output after scaling is in pOut and not in pDst
394 @// finally store from pOut to pDst
395 @// change branch "End" to branch "finalComplexToRealFixup" in the above
396 @// chk the code below for multiplication by j factor
397
398 finalComplexToRealFixup:
@// Converts the N/2-point complex result Z (read from pSrc) into the output
@// of the real FFT (written through argDst). This first part produces F(0)
@// and F(N/2) and sets up the pointers and strides for the butterfly loop.
@// In the comments of this section N is the real FFT size, N/2 = subFFTSize.
399
400
401 @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
402 @// 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
403 @// 1/2[2a+j0] - j [0+j2b]
404 @// (a+b, 0)
405
406 @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
407 @// 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
408 @// 1/2[2a+j0] + j [0+j2b]
409 @// (a-b, 0)
410
411 @// F(0) and F(N/2)
@// Lane 0 of dX0r/dX0i receives Z(0).re / Z(0).im; pSrc advances by 8 bytes.
@// Lane 1 of both registers is cleared so the stored imaginary parts are 0.
412 VLD2 {dX0r[0],dX0i[0]},[pSrc]!
413 MOV zero,#0
414 VMOV dX0r[1],zero
415 MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes
416 VMOV dX0i[1],zero
417 SUB twStep,step,subFFTSize,LSL #1 @// twStep = 3N/8 * 8 bytes pointing to W^1
418
419 VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
420 MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes
421 VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
@// The flags set here are consumed by the BLT / BEQ further down; nothing in
@// between writes the flags.
422 SUBS subFFTSize,subFFTSize,#2
423
@// F(0) goes to argDst, F(N/2) to argDst + step. After the SUB below argDst
@// is 8 bytes past its starting value.
424 VST1 dY0r,[argDst],step
425 ADD pTwiddleTmp,argTwiddle,#8 @// W^2
426 VST1 dY0i,[argDst]!
427 ADD argTwiddle,argTwiddle,twStep @// W^1
428
429 VDUP dzero,zero
430 SUB argDst,argDst,step
431
@// subFFTSize was 1: nothing left to do. subFFTSize was 2: only the single
@// middle element remains.
432 BLT End
433 BEQ lastElement
434 SUB step,step,#24
435 SUB step1,step1,#8 @// (N/4-1)*8 bytes
436
437 @// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
438 @// Note: W^k is stored as negative values in the table
439 @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them
440 @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
441
442
443 evenOddButterflyLoop:
@// Each pass reads two elements from the front of the data (dX0r/dX0i) and
@// two from the back (dX1r/dX1i), and writes two outputs at the front and two
@// at the back of the destination. step and step1 are the front-to-back
@// distances for the data and the twiddle table; both shrink every pass.
@// NOTE(review): the twiddle-table layout is defined outside this file, so
@// the meaning of the two twiddle read positions is not verified here.
444
445
@// Twiddle reads through argTwiddle: 8 bytes, skip step1, 8 more bytes.
446 VLD1 dW0r,[argTwiddle],step1
447 VLD1 dW1r,[argTwiddle]!
448
@// VLD2 de-interleaves: real parts into d..r, imaginary parts into d..i.
449 VLD2 {dX0r,dX0i},[pSrc],step
450 SUB argTwiddle,argTwiddle,step1
451 VLD2 {dX1r,dX1i},[pSrc]!
452
453
454
455 SUB step1,step1,#8 @// (N/4-2)*8 bytes
@// Same read pattern through pTwiddleTmp, with the reduced step1.
456 VLD1 dW0i,[pTwiddleTmp],step1
457 VLD1 dW1i,[pTwiddleTmp]!
458 SUB pSrc,pSrc,step
459
460 SUB pTwiddleTmp,pTwiddleTmp,step1
@// Swap the two 32-bit lanes of the back elements so that they pair up with
@// the front elements lane by lane.
461 VREV64 dX1r,dX1r
462 VREV64 dX1i,dX1i
@// Loop counter: four elements are consumed per pass. The flags set here are
@// consumed by the BGT at the bottom; nothing in between writes the flags.
463 SUBS subFFTSize,subFFTSize,#4
464
465
466
467 VSUB dT2,dX0r,dX1r @// a-c
468 SUB step1,step1,#8
469 VADD dT3,dX0i,dX1i @// b+d
470 VADD dT0,dX0r,dX1r @// a+c
471 VSUB dT1,dX0i,dX1i @// b-d
@// Halving add with zero: dT0 = (a+c)/2, dT1 = (b-d)/2.
472 VHADD dT0,dT0,dzero
473 VHADD dT1,dT1,dzero
474
@// Interleave the lanes of each pair of twiddle registers.
475 VZIP dW1r,dW1i
476 vzip dW0r,dW0i
477
478
@// 32x32 -> 64-bit products of the twiddles with (a-c) and (b+d).
479 VMULL qT0,dW1r,dT2
480 VMLAL qT0,dW1i,dT3
481 VMULL qT1,dW1r,dT3
482 VMLSL qT1,dW1i,dT2
483
484 VMULL qT2,dW0r,dT2
485 VMLSL qT2,dW0i,dT3
486 VMULL qT3,dW0r,dT3
487 VMLAL qT3,dW0i,dT2
488
489
@// Narrow back to 32 bits, keeping the rounded upper half of each product.
490 VRSHRN dX1r,qT0,#32
491 VRSHRN dX1i,qT1,#32
492
493 VSUB dY1r,dT0,dX1i @// F(N/2 -1)
494 VADD dY1i,dT1,dX1r
495 VNEG dY1i,dY1i
496
@// Undo the lane swap so the back outputs are stored in memory order.
497 VREV64 dY1r,dY1r
498 VREV64 dY1i,dY1i
499
500
501 VRSHRN dX0r,qT2,#32
502 VRSHRN dX0i,qT3,#32
503
504
505 VSUB dY0r,dT0,dX0i @// F(1)
506 VADD dY0i,dT1,dX0r
507
508
@// VST2 re-interleaves real and imaginary parts. The front pair goes to
@// argDst and the back pair to argDst + step; argDst ends up 16 bytes further.
509 VST2 {dY0r,dY0i},[argDst],step
510 VST2 {dY1r,dY1i},[argDst]!
511 SUB argDst,argDst,step
512 SUB step,step,#32 @// (N/2-4)*8 bytes
513
514
515 BGT evenOddButterflyLoop
516
517 SUB pSrc,pSrc,#8 @// set both the ptrs to the last element
518 SUB argDst,argDst,#8
519
520
521
522 @// Last element can be expanded as follows
523 @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
524 @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
525 @// 1/2[2a+j0] + j (c+jd) [0+j2b]
526 @// (a-bc, -bd)
527 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
528
529 lastElement:
@// Load one element (a, b) from pSrc, store a unchanged, then negate the
@// register and store lane 1, which now holds -b.
530 VLD1 dX0r,[pSrc]
531
532 VST1 dX0r[0],[argDst]!
533 VNEG dX0r,dX0r
534 VST1 dX0r[1],[argDst]!
535
536
537
538
539
540
541 End:
542 @// Set return value
@// Every path through the routine ends here, so the result is always
@// OMX_Sts_NoErr.
543 MOV result, #OMX_Sts_NoErr
544
545 @// Write function tail
546 M_END
547
548 .end
549
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698