Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(77)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @//
11 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
12 @// to support float instead of SC32.
13 @//
14
15 @//
16 @// Description:
17 @// Compute a Radix 4 FFT stage for a N point complex signal
18 @//
19 @//
20
21
22 @// Include standard headers
23
24 #include "dl/api/armCOMM_s.h"
25 #include "dl/api/omxtypes_s.h"
26
27
28 @// Import symbols required from other files
29 @// (For example tables)
30
31
32
33
34 @// Set debugging level
35 @//DEBUG_ON SETL {TRUE}
36
37
38
39 @// Guarding implementation by the processor name
40
41
42
43
44 @// Guarding implementation by the processor name
45
46
47 @// Import symbols required from other files
48 @// (For example tables)
49
50
51 @//Input Registers
52 @// FFTSTAGE loads data through pSrc, loads twiddles through pTwiddle and stores through pDst.
53 #define pSrc r0
54 #define pDst r2
55 #define pTwiddle r1
56 #define subFFTNum r6
57 #define subFFTSize r7
58 @// FFTSTAGE divides subFFTNum by 4 and multiplies subFFTSize by 4 before doing anything else.
59
60
61 @//Output Registers
62 @// None are named here; FFTSTAGE rewrites pSrc, pDst, subFFTNum and subFFTSize before it ends.
63
64 @//Local Scratch Registers
65 @// Loop counters (grpCount, setCount) and byte strides used inside FFTSTAGE.
66 #define grpCount r3
67 #define pointStep r4
68 #define outPointStep r5
69 #define stepTwiddle r12
70 #define setCount r14
71 #define srcStep r8
72 #define setStep r9
73 #define dstStep r10
74 #define twStep r11
75 #define t1 r3
76 @// t1 shares r3 with grpCount; FFTSTAGE only writes t1 after the group loop has finished.
77 @// Neon Registers
78 @// NOTE(review): D8-D15 (callee-saved under AAPCS) are aliased below and not saved by FFTSTAGE; confirm who preserves them.
79 #define dW1 D0.F32
80 #define dW2 D1.F32
81 #define dW3 D2.F32
82 @// dX* hold deinterleaved inputs, dY* first-stage results, dZ* twiddle products and final outputs.
83 #define dXr0 D4.F32
84 #define dXi0 D5.F32
85 #define dXr1 D6.F32
86 #define dXi1 D7.F32
87 #define dXr2 D8.F32
88 #define dXi2 D9.F32
89 #define dXr3 D10.F32
90 #define dXi3 D11.F32
91 #define dYr0 D12.F32
92 #define dYi0 D13.F32
93 #define dYr1 D14.F32
94 #define dYi1 D15.F32
95 #define dYr2 D16.F32
96 #define dYi2 D17.F32
97 #define dYr3 D18.F32
98 #define dYi3 D19.F32
99 #define qT0 d16.f32
100 #define qT1 d18.f32
101 #define qT2 d12.f32
102 #define qT3 d14.f32
103 #define dZr0 D20.F32
104 #define dZi0 D21.F32
105 #define dZr1 D22.F32
106 #define dZi1 D23.F32
107 #define dZr2 D24.F32
108 #define dZi2 D25.F32
109 #define dZr3 D26.F32
110 #define dZi3 D27.F32
111 @// qT0-qT3 above name D registers despite the q prefix and are not referenced by FFTSTAGE.
112 #define qY0 Q6.F32
113 #define qY1 Q7.F32
114 #define qY2 Q8.F32
115 #define qY3 Q9.F32
116 #define qX0 Q2.F32
117 #define qZ0 Q10.F32
118 #define qZ1 Q11.F32
119 #define qZ2 Q12.F32
120 #define qZ3 Q13.F32
121 @// Each Qn overlays D(2n) and D(2n+1), so qX0 = dXr0:dXi0, qYk = dYrk:dYik and qZk = dZrk:dZik.
122 .MACRO FFTSTAGE scaled, inverse , name
123 @// Emits one out-of-place radix-4 stage: data is loaded through pSrc and stored through pDst.
124 @// Define stack arguments
125 @// Macro arguments: scaled is not referenced in this body; inverse equal to TRUE selects
126 @// the inverse code paths; name is appended to the local labels.
127 @// Update grpCount and grpSize right away in order to reuse
128 @// pGrpCount and pGrpSize regs
129
130 LSL grpCount,subFFTSize,#2 @// grpCount = 4*subFFTSize
131 LSR subFFTNum,subFFTNum,#2 @// subFFTNum = subFFTNum/4
132 MOV subFFTSize,grpCount @// subFFTSize = 4*subFFTSize
133
134 VLD1 dW1,[pTwiddle] @//[wi | wr]
135 @// pT0+1 increments pT0 by 8 bytes
136 @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
137 MOV pointStep,subFFTNum,LSL #1 @// pointStep = 2*subFFTNum
138
139
140 @// pOut0+1 increments pOut0 by 8 bytes
141 @// pOut0+outPointStep == increment of 8*outPointStep bytes
142 @// = 2*size bytes
143
144 MOV stepTwiddle,#0
145 VLD1 dW2,[pTwiddle] @//[wi | wr]
146 SMULBB outPointStep,grpCount,pointStep @// signed multiply of the two low halfwords only
147 LSL pointStep,pointStep,#2 @// 2*grpSize
148
149 VLD1 dW3,[pTwiddle] @//[wi | wr]
150 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
151 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
152
153 RSB setStep,setStep,#0 @// setStep = - 3*pointStep
154 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
155
156 MOV dstStep,outPointStep,LSL #1 @// dstStep = 2*outPointStep
157 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
158 @// dstStep = - 3*outPointStep+16
159 RSB dstStep,dstStep,#16
160
161
162 @// Loop on the groups: grpCount drops by 4 per pass and new twiddles are loaded at the end of each pass.
163 radix4GrpLoop\name :
164
165 VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]
166 ADD stepTwiddle,stepTwiddle,pointStep
167 VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]
168 @// set pTwiddle to the first point
169 ADD pTwiddle,pTwiddle,stepTwiddle
170 VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]
171 MOV twStep,stepTwiddle,LSL #2
172
173 @// data[3] & update pSrc for the next set
174 VLD2 {dXr3,dXi3},[pSrc],setStep
175 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
176
177 MOV setCount,pointStep,LSR #3 @// setCount = pointStep/8
178 @// set pSrc to data[0] of the next set
179 ADD pSrc,pSrc,#16
180 @// increment to data[1] of the next set
181 ADD pSrc,pSrc,pointStep
182
183
184 @// Loop on the sets (setCount drops by 2 per pass; each VLD2 brings in two complex points)
185
186 radix4SetLoop\name :
187
188 @// Twiddle multiply for k = 1..3: Z[k] = X[k]*W[k] (forward) or X[k]*conj(W[k]) (inverse),
189 @// taking lane 0 of dWk as the real part and lane 1 as the imaginary part.
190 .ifeqs "\inverse", "TRUE"
191 VMUL dZr1,dXr1,dW1[0]
192 VMUL dZi1,dXi1,dW1[0]
193 VMUL dZr2,dXr2,dW2[0]
194 VMUL dZi2,dXi2,dW2[0]
195 VMUL dZr3,dXr3,dW3[0]
196 VMUL dZi3,dXi3,dW3[0]
197
198 VMLA dZr1,dXi1,dW1[1] @// real part
199 VMLS dZi1,dXr1,dW1[1] @// imag part
200
201 @// data[1] for next iteration
202 VLD2 {dXr1,dXi1},[pSrc],pointStep
203
204 VMLA dZr2,dXi2,dW2[1] @// real part
205 VMLS dZi2,dXr2,dW2[1] @// imag part
206
207 @// data[2] for next iteration
208 VLD2 {dXr2,dXi2},[pSrc],pointStep
209
210 VMLA dZr3,dXi3,dW3[1] @// real part
211 VMLS dZi3,dXr3,dW3[1] @// imag part
212 .else
213 VMUL dZr1,dXr1,dW1[0]
214 VMUL dZi1,dXi1,dW1[0]
215 VMUL dZr2,dXr2,dW2[0]
216 VMUL dZi2,dXi2,dW2[0]
217 VMUL dZr3,dXr3,dW3[0]
218 VMUL dZi3,dXi3,dW3[0]
219
220 VMLS dZr1,dXi1,dW1[1] @// real part
221 VMLA dZi1,dXr1,dW1[1] @// imag part
222
223 @// data[1] for next iteration
224 VLD2 {dXr1,dXi1},[pSrc],pointStep
225
226 VMLS dZr2,dXi2,dW2[1] @// real part
227 VMLA dZi2,dXr2,dW2[1] @// imag part
228
229 @// data[2] for next iteration
230 VLD2 {dXr2,dXi2},[pSrc],pointStep
231
232 VMLS dZr3,dXi3,dW3[1] @// real part
233 VMLA dZi3,dXr3,dW3[1] @// imag part
234 .endif
235
236 @// data[3] & update pSrc to data[0]
237 @// But don't read on the very last iteration because that reads past
238 @// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
239 cmp grpCount, #4
240 cmpeq setCount, #2 @// Test setCount if grpCount = 4
241 @// These are executed only if both grpCount = 4 and setCount = 2
242 addeq pSrc, pSrc, setStep
243 beq radix4SkipRead\name
244 VLD2 {dXr3,dXi3},[pSrc],setStep
245 radix4SkipRead\name:
246 SUBS setCount,setCount,#2 @// these flags are the ones tested by the BGT that closes the set loop
247
248 @// finish first stage of 4 point FFT
249 VADD qY0,qX0,qZ2
250 VSUB qY2,qX0,qZ2
251
252 @// data[0] for next iteration
253 VLD2 {dXr0,dXi0},[pSrc :128]!
254 VADD qY1,qZ1,qZ3
255 VSUB qY3,qZ1,qZ3
256
257 @// finish second stage of 4 point FFT
258
259 VSUB qZ0,qY2,qY1
260 @// NOTE(review): qZ0 = qY2 - qY1 is data[0] minus all three twiddle products, so the products
261 @// enter the butterfly negated; presumably the twiddle table stores negated factors. TODO confirm.
262 .ifeqs "\inverse", "TRUE"
263 @// Inverse: the four results are stored in the order Z0, Z3, Z2, Z1.
264 VADD dZr3,dYr0,dYi3
265 VST2 {dZr0,dZi0},[pDst :128],outPointStep
266 VSUB dZi3,dYi0,dYr3
267
268 VADD qZ2,qY2,qY1
269 VST2 {dZr3,dZi3},[pDst :128],outPointStep
270
271 VSUB dZr1,dYr0,dYi3
272 VST2 {dZr2,dZi2},[pDst :128],outPointStep
273 VADD dZi1,dYi0,dYr3
274
275 VST2 {dZr1,dZi1},[pDst :128],dstStep
276
277
278 .else
279 @// Forward: the four results are stored in the order Z0, Z1, Z2, Z3.
280 VSUB dZr1,dYr0,dYi3
281 VST2 {dZr0,dZi0},[pDst :128],outPointStep
282 VADD dZi1,dYi0,dYr3
283
284 VADD qZ2,qY2,qY1
285 VST2 {dZr1,dZi1},[pDst :128],outPointStep
286
287 VADD dZr3,dYr0,dYi3
288 VST2 {dZr2,dZi2},[pDst :128],outPointStep
289 VSUB dZi3,dYi0,dYr3
290
291 VST2 {dZr3,dZi3},[pDst :128],dstStep
292
293
294 .endif
295
296 @// increment to data[1] of the next set
297 ADD pSrc,pSrc,pointStep
298 BGT radix4SetLoop\name
299
300 @// Load the twiddles for the next group; the three post-increments return pTwiddle to its value at the top of this group pass.
301 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
302 @// subtract 4 since grpCount multiplied by 4
303 SUBS grpCount,grpCount,#4
304 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
305 @// increment pSrc for the next grp
306 ADD pSrc,pSrc,srcStep
307 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
308 BGT radix4GrpLoop\name
309
310
311 @// Reset and Swap pSrc and pDst for the next stage
312 MOV t1,pDst @// t1 reuses r3; grpCount is not read again in this macro
313 @// new pDst = old pSrc - 4*outPointStep; new pSrc = old pDst - outPointStep
314 SUB pDst,pSrc,outPointStep,LSL #2
315 SUB pSrc,t1,outPointStep
316
317
318 .endm
319 @// Forward radix-4 stage: expands FFTSTAGE with inverse = FALSE and label suffix FWD.
320 @// M_START and M_END are not defined in this file (presumably they come from dl/api/armCOMM_s.h).
321 M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
322 FFTSTAGE "FALSE","FALSE",FWD
323 M_END
324 @// Inverse radix-4 stage: expands FFTSTAGE with inverse = TRUE and label suffix INV.
325 @// M_START and M_END are not defined in this file (presumably they come from dl/api/armCOMM_s.h).
326 M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
327 FFTSTAGE "FALSE","TRUE",INV
328 M_END
329
330
331 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698