Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(22)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This file was originally licensed as follows. It has been
11 @// relicensed with permission from the copyright holders.
12 @//
13
14 @//
15 @// File Name: armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision: 7767
18 @// Last Modified Date: Thu, 27 Sep 2007
19 @//
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @//
22 @//
23 @//
24 @// Description:
25 @// Compute a Radix 4 FFT stage for a N point complex signal
26 @//
27
28
29
30
31 @// Include standard headers
32
33 #include "dl/api/armCOMM_s.h"
34 #include "dl/api/omxtypes_s.h"
35
36
37 @// Import symbols required from other files
38 @// (For example tables)
39
40
41
42
43 @// Set debugging level
44 @//DEBUG_ON SETL {TRUE}
45
46
47
48 @// Guarding implementation by the processor name
49
50
51
52
53 @// Guarding implementation by the processor name
54
55
56 @// Import symbols required from other files
57 @// (For example tables)
58
59
60 @//Input Registers (read by FFTSTAGE on entry; all five have been modified by the time the macro finishes)
61 @// pSrc/pDst: source and destination pointers; pTwiddle: twiddle pointer; subFFTNum is divided by 4 and subFFTSize multiplied by 4 at the start of the stage.
62 #define pSrc r0
63 #define pDst r2
64 #define pTwiddle r1
65 #define subFFTNum r6
66 #define subFFTSize r7
67
68
69
70 @//Output Registers
71
72
73 @//Local Scratch Registers
74 @// grpCount and t1 both name r3: t1 is only used after the group loop has finished. setCount lives in r14 (the link register).
75 #define grpCount r3
76 #define pointStep r4
77 #define outPointStep r5
78 #define stepTwiddle r12
79 #define setCount r14
80 #define srcStep r8
81 #define setStep r9
82 #define dstStep r10
83 #define twStep r11
84 #define t1 r3
85
86 @// Neon Registers
87 @// Qn overlays D(2n) and D(2n+1), so several of the aliases below name the same storage. dW1..dW3 hold one twiddle each.
88 #define dW1 D0.S32
89 #define dW2 D1.S32
90 #define dW3 D2.S32
91 @// As used by FFTSTAGE: X = de-interleaved inputs, Y = first-stage butterfly results, Z = twiddle products and final outputs.
92 #define dXr0 D4.S32
93 #define dXi0 D5.S32
94 #define dXr1 D6.S32
95 #define dXi1 D7.S32
96 #define dXr2 D8.S32
97 #define dXi2 D9.S32
98 #define dXr3 D10.S32
99 #define dXi3 D11.S32
100 #define dYr0 D12.S32
101 #define dYi0 D13.S32
102 #define dYr1 D14.S32
103 #define dYi1 D15.S32
104 #define dYr2 D16.S32
105 #define dYi2 D17.S32
106 #define dYr3 D18.S32
107 #define dYi3 D19.S32
108 #define qT0 Q8.S64
109 #define qT1 Q9.S64
110 #define qT2 Q6.S64
111 #define qT3 Q7.S64
112 @// qT0..qT3 (S64 views of Q8, Q9, Q6, Q7) are the 64-bit accumulators of the twiddle multiply; they overlay the dY registers above.
113 #define dZr0 D20.S32
114 #define dZi0 D21.S32
115 #define dZr1 D22.S32
116 #define dZi1 D23.S32
117 #define dZr2 D24.S32
118 #define dZi2 D25.S32
119 #define dZr3 D26.S32
120 #define dZi3 D27.S32
121 @// Quad views: qY0..qY3 = Q6..Q9 (the same registers as qT2, qT3, qT0, qT1), qX0 = dXr0:dXi0, qZn = dZrn:dZin.
122 #define qY0 Q6.S32
123 #define qY1 Q7.S32
124 #define qY2 Q8.S32
125 #define qY3 Q9.S32
126 #define qX0 Q2.S32
127 #define qZ0 Q10.S32
128 #define qZ1 Q11.S32
129 #define qZ2 Q12.S32
130 #define qZ3 Q13.S32
131 @// FFTSTAGE expands to the body of one radix-4 FFT stage; it is instantiated four times further down in this file.
132 @// scaled: "TRUE" selects the halving VHADD/VHSUB butterflies, anything else VADD/VSUB. inverse: "TRUE" flips the sign of the W[1] terms in the twiddle multiply and stores the outputs as Z0,Z3,Z2,Z1 instead of Z0,Z1,Z2,Z3. name: suffix that keeps the loop labels unique per expansion.
133 .MACRO FFTSTAGE scaled, inverse , name
134 @// On entry pSrc, pDst, pTwiddle, subFFTNum and subFFTSize hold the stage inputs; on exit pSrc/pDst are swapped for the next stage.
135 @// Define stack arguments
136
137
138 @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
139
140 LSL grpCount,subFFTSize,#2 @// grpCount = 4*subFFTSize
141 LSR subFFTNum,subFFTNum,#2 @// subFFTNum = subFFTNum/4
142 MOV subFFTSize,grpCount @// subFFTSize = 4*subFFTSize
143
144 VLD1 dW1,[pTwiddle] @//[wi | wr]
145 @// pT0+1 increments pT0 by 8 bytes
146 @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
147 MOV pointStep,subFFTNum,LSL #1 @// pointStep = 2*subFFTNum (scaled by 4 a few lines below)
148
149
150 @// pOut0+1 increments pOut0 by 8 bytes
151 @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size bytes
152
153 MOV stepTwiddle,#0 @// twiddle stride starts at 0 and grows by pointStep per group
154 VLD1 dW2,[pTwiddle] @//[wi | wr] (same address as dW1: no writeback)
155 SMULBB outPointStep,grpCount,pointStep @// signed 16x16 multiply of the low halfwords of grpCount and pointStep
156 LSL pointStep,pointStep,#2 @// 2*grpSize
157
158 VLD1 dW3,[pTwiddle] @//[wi | wr]
159 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
160 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
161 @//RSB setStep,setStep,#16 @// setStep = - 3 *pointStep+16
162 RSB setStep,setStep,#0 @// setStep = - 3*pointStep
163 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
164
165 MOV dstStep,outPointStep,LSL #1 @// dstStep = 2*outPointStep
166 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
167 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
168
169
170 @// Outer loop: one pass per twiddle group; grpCount is decremented by 4 at the bottom.
171 grpLoop\name :
172 @// Prime the pipeline: each VLD2 de-interleaves four 32-bit words into an r/i register pair and post-increments pSrc.
173 VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]
174 ADD stepTwiddle,stepTwiddle,pointStep @// stepTwiddle += pointStep
175 VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]
176 ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
177 VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]
178 MOV twStep,stepTwiddle,LSL #2 @// twStep = 4*stepTwiddle (temporary)
179
180 VLD2 {dXr3,dXi3},[pSrc],setStep @// data[3] & update pSrc for the next set
181 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
182
183 MOV setCount,pointStep,LSR #3 @// setCount = pointStep/8
184 ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set
185 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
186
187
188 @// Loop on the sets
189
190 setLoop\name :
191
192
193
194 SUBS setCount,setCount,#2 @// decrement the loop counter; the flags survive until the BGT at the bottom (nothing in between sets them)
195 @// Z1 = X1*W1 as 64-bit products: qT0 = real part, qT1 = imaginary part.
196 .ifeqs "\inverse", "TRUE"
197 VMULL qT0,dXr1,dW1[0]
198 VMLAL qT0,dXi1,dW1[1] @// real part
199 VMULL qT1,dXi1,dW1[0]
200 VMLSL qT1,dXr1,dW1[1] @// imag part
201
202 .else
203 VMULL qT0,dXr1,dW1[0]
204 VMLSL qT0,dXi1,dW1[1] @// real part
205 VMULL qT1,dXi1,dW1[0]
206 VMLAL qT1,dXr1,dW1[1] @// imag part
207
208 .endif
209
210 VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1] for next iteration
211 @// Z2 = X2*W2 as 64-bit products: qT2 = real part, qT3 = imaginary part.
212 .ifeqs "\inverse", "TRUE"
213 VMULL qT2,dXr2,dW2[0]
214 VMLAL qT2,dXi2,dW2[1] @// real part
215 VMULL qT3,dXi2,dW2[0]
216 VMLSL qT3,dXr2,dW2[1] @// imag part
217
218 .else
219 VMULL qT2,dXr2,dW2[0]
220 VMLSL qT2,dXi2,dW2[1] @// real part
221 VMULL qT3,dXi2,dW2[0]
222 VMLAL qT3,dXr2,dW2[1] @// imag part
223
224 .endif
225 @// Rounding narrow by 31 bits turns the 64-bit products back into 32-bit values; this also frees qT0/qT1 for the X3 product.
226 VRSHRN dZr1,qT0,#31
227 VRSHRN dZi1,qT1,#31
228 VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2] for next iteration
229
230 @// Z3 = X3*W3, reusing qT0/qT1.
231 .ifeqs "\inverse", "TRUE"
232 VMULL qT0,dXr3,dW3[0]
233 VMLAL qT0,dXi3,dW3[1] @// real part
234 VMULL qT1,dXi3,dW3[0]
235 VMLSL qT1,dXr3,dW3[1] @// imag part
236
237 .else
238 VMULL qT0,dXr3,dW3[0]
239 VMLSL qT0,dXi3,dW3[1] @// real part
240 VMULL qT1,dXi3,dW3[0]
241 VMLAL qT1,dXr3,dW3[1] @// imag part
242
243 .endif
244
245 VRSHRN dZr2,qT2,#31
246 VRSHRN dZi2,qT3,#31
247
248
249 VRSHRN dZr3,qT0,#31
250 VRSHRN dZi3,qT1,#31
251 VLD2 {dXr3,dXi3},[pSrc],setStep @// data[3] & update pSrc to data[0]
252 @// All four qT accumulators are narrowed by now, so the qY registers that overlay them may be written.
253 .ifeqs "\scaled", "TRUE"
254
255 @// finish first stage of 4 point FFT
256 VHADD qY0,qX0,qZ2
257 VHSUB qY2,qX0,qZ2
258
259 VLD2 {dXr0,dXi0},[pSrc]! @// data[0] for next iteration
260 VHADD qY1,qZ1,qZ3
261 VHSUB qY3,qZ1,qZ3
262
263 @// finish second stage of 4 point FFT
264 @// NOTE(review): the sums below pair qY2 with qY1 and qY0 with qY3 rather than Y0/Y1 and Y2/Y3; presumably the twiddle table holds negated values so that Z1..Z3 are -X*W -- confirm against the table.
265 VHSUB qZ0,qY2,qY1
266
267 @// The arithmetic is interleaved with the stores; each VST2 writes a result completed earlier.
268 .ifeqs "\inverse", "TRUE"
269
270 VHADD dZr3,dYr0,dYi3
271 VST2 {dZr0,dZi0},[pDst :128],outPointStep
272 VHSUB dZi3,dYi0,dYr3
273
274 VHADD qZ2,qY2,qY1
275 VST2 {dZr3,dZi3},[pDst :128],outPointStep
276
277 VHSUB dZr1,dYr0,dYi3
278 VST2 {dZr2,dZi2},[pDst :128],outPointStep
279 VHADD dZi1,dYi0,dYr3
280
281 VST2 {dZr1,dZi1},[pDst :128],dstStep
282
283
284 .else
285
286 VHSUB dZr1,dYr0,dYi3
287 VST2 {dZr0,dZi0},[pDst :128],outPointStep
288 VHADD dZi1,dYi0,dYr3
289
290 VHADD qZ2,qY2,qY1
291 VST2 {dZr1,dZi1},[pDst :128],outPointStep
292
293 VHADD dZr3,dYr0,dYi3
294 VST2 {dZr2,dZi2},[pDst :128],outPointStep
295 VHSUB dZi3,dYi0,dYr3
296
297 VST2 {dZr3,dZi3},[pDst :128],dstStep
298
299
300 .endif
301
302
303 .else
304
305 @// finish first stage of 4 point FFT
306 VADD qY0,qX0,qZ2
307 VSUB qY2,qX0,qZ2
308 @// NOTE(review): this is the only VLD2 in the macro with a :128 alignment hint (the scaled branch omits it) -- confirm which is intended.
309 VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] for next iteration
310 VADD qY1,qZ1,qZ3
311 VSUB qY3,qZ1,qZ3
312
313 @// finish second stage of 4 point FFT
314 @// Same pairing as the scaled branch (qY2 with qY1, qY0 with qY3), without the halving.
315 VSUB qZ0,qY2,qY1
316
317
318 .ifeqs "\inverse", "TRUE"
319
320 VADD dZr3,dYr0,dYi3
321 VST2 {dZr0,dZi0},[pDst :128],outPointStep
322 VSUB dZi3,dYi0,dYr3
323
324 VADD qZ2,qY2,qY1
325 VST2 {dZr3,dZi3},[pDst :128],outPointStep
326
327 VSUB dZr1,dYr0,dYi3
328 VST2 {dZr2,dZi2},[pDst :128],outPointStep
329 VADD dZi1,dYi0,dYr3
330
331 VST2 {dZr1,dZi1},[pDst :128],dstStep
332
333
334 .else
335
336 VSUB dZr1,dYr0,dYi3
337 VST2 {dZr0,dZi0},[pDst :128],outPointStep
338 VADD dZi1,dYi0,dYr3
339
340 VADD qZ2,qY2,qY1
341 VST2 {dZr1,dZi1},[pDst :128],outPointStep
342
343 VADD dZr3,dYr0,dYi3
344 VST2 {dZr2,dZi2},[pDst :128],outPointStep
345 VSUB dZi3,dYi0,dYr3
346
347 VST2 {dZr3,dZi3},[pDst :128],dstStep
348
349
350 .endif
351
352 .endif
353
354 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
355 BGT setLoop\name
356
357 @// Fetch the three twiddles of the next group: two steps forward by stepTwiddle, then back by twStep = -3*stepTwiddle.
358 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
359 SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
360 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
361 ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
362 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
363 BGT grpLoop\name
364
365
366 @// Reset and Swap pSrc and pDst for the next stage
367 MOV t1,pDst @// t1 is r3, i.e. grpCount, which is no longer needed here
368 SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= 2*size; pSrc -= 8*size bytes
369 SUB pSrc,t1,outPointStep
370
371
372 .endm
373 @// Entry points: each M_START/M_END pair wraps one FFTSTAGE expansion; the FFTSTAGE arguments are (scaled, inverse, label suffix).
374 @// NOTE(review): M_START/M_END come from the included headers and are not visible here; only r4 is named, while the stage writes r0-r12, r14 and NEON registers including D8-D15 -- presumably the callers of these "_unsafe" routines preserve them; confirm.
375 M_START armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
376 FFTSTAGE "FALSE","FALSE",FWD
377 M_END
378
379 @// Inverse transform, unscaled.
380 M_START armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
381 FFTSTAGE "FALSE","TRUE",INV
382 M_END
383
384 @// Forward transform using the halving (scaled) butterflies.
385 M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
386 FFTSTAGE "TRUE","FALSE",FWDSFS
387 M_END
388
389 @// Inverse transform using the halving (scaled) butterflies.
390 M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
391 FFTSTAGE "TRUE","TRUE",INVSFS
392 M_END
393
394
395 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698