Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
11 @// to support float instead of SC32.
12 @//
13
14 @//
15 @// Description:
16 @// Compute the first-stage radix-8 FFT pass for an N-point complex signal
17 @//
18 @//
19
20
21 @// Include standard headers
22
23 #include "dl/api/armCOMM_s.h"
24 #include "dl/api/omxtypes_s.h"
25
26 @// Import symbols required from other files
27 @// (For example tables)
28
29
30 @// Set debugging level
31 @//DEBUG_ON SETL {TRUE}
32
33
34
35 @// Guarding implementation by the processor name
36
37
38
39
40 @// Guarding implementation by the processor name
41
42 @//Input Registers
@// pTwiddle is defined here but is not referenced by any instruction in this file.
43
44 #define pSrc r0
45 #define pDst r2
46 #define pTwiddle r1
47 #define subFFTNum r6
48 #define subFFTSize r7
49 @// dest buffer for the next stage (not pSrc for first stage)
50 #define pPingPongBuf r5
51
52
53 @//Output Registers
@// (none are named separately; FFTSTAGE rewrites pSrc, pDst, subFFTNum and subFFTSize)
54
55
56 @//Local Scratch Registers
@// grpSize/setCount share r3 and pointStep/outPointStep share r4.
@// outPointStep appears only in comments, never in an instruction.
57
58 #define grpSize r3
59 @// Reuse grpSize as setCount
60 #define setCount r3
61 #define pointStep r4
62 #define outPointStep r4
63 #define setStep r8
64 #define step1 r9
65 #define step2 r10
66 #define t0 r11
67
68
68
69 @// Neon Registers
@// Each Qn overlays the D register pair D(2n), D(2n+1), so every qXk/qUk/qVk/qYk
@// below names the same storage as its {real, imag} D pair with the same index k.
@// X (inputs) occupy D0-D15. U, V and Y all live in D16-D31 and overlap:
@//   even-index U and Y share D16-D23 with odd-index V;
@//   odd-index U and Y share D24-D31 with even-index V.
70
71 #define dXr0 D0.F32
72 #define dXi0 D1.F32
73 #define dXr1 D2.F32
74 #define dXi1 D3.F32
75 #define dXr2 D4.F32
76 #define dXi2 D5.F32
77 #define dXr3 D6.F32
78 #define dXi3 D7.F32
79 #define dXr4 D8.F32
80 #define dXi4 D9.F32
81 #define dXr5 D10.F32
82 #define dXi5 D11.F32
83 #define dXr6 D12.F32
84 #define dXi6 D13.F32
85 #define dXr7 D14.F32
86 #define dXi7 D15.F32
87 #define qX0 Q0.F32
88 #define qX1 Q1.F32
89 #define qX2 Q2.F32
90 #define qX3 Q3.F32
91 #define qX4 Q4.F32
92 #define qX5 Q5.F32
93 #define qX6 Q6.F32
94 #define qX7 Q7.F32
95
96 #define dUr0 D16.F32
97 #define dUi0 D17.F32
98 #define dUr2 D18.F32
99 #define dUi2 D19.F32
100 #define dUr4 D20.F32
101 #define dUi4 D21.F32
102 #define dUr6 D22.F32
103 #define dUi6 D23.F32
104 #define dUr1 D24.F32
105 #define dUi1 D25.F32
106 #define dUr3 D26.F32
107 #define dUi3 D27.F32
108 #define dUr5 D28.F32
109 #define dUi5 D29.F32
110 @// dUr7/dUi7 are D30/D31; the dXr7/dXi7 registers (D14/D15) are reused by dT0/dT1 below instead
111 #define dUr7 D30.F32
112 #define dUi7 D31.F32
113 #define qU0 Q8.F32
114 #define qU1 Q12.F32
115 #define qU2 Q9.F32
116 #define qU3 Q13.F32
117 #define qU4 Q10.F32
118 #define qU5 Q14.F32
119 #define qU6 Q11.F32
120 #define qU7 Q15.F32
121
122
@// V aliases: index parity is swapped relative to U (V even -> D24-D31, V odd -> D16-D23)
123 #define dVr0 D24.F32
124 #define dVi0 D25.F32
125 #define dVr2 D26.F32
126 #define dVi2 D27.F32
127 #define dVr4 D28.F32
128 #define dVi4 D29.F32
129 #define dVr6 D30.F32
130 #define dVi6 D31.F32
131 #define dVr1 D16.F32
132 #define dVi1 D17.F32
133 #define dVr3 D18.F32
134 #define dVi3 D19.F32
135 #define dVr5 D20.F32
136 #define dVi5 D21.F32
137 #define dVr7 D22.F32
138 #define dVi7 D23.F32
139 #define qV0 Q12.F32
140 #define qV1 Q8.F32
141 #define qV2 Q13.F32
142 #define qV3 Q9.F32
143 #define qV4 Q14.F32
144 #define qV5 Q10.F32
145 #define qV6 Q15.F32
146 #define qV7 Q11.F32
147
@// Y aliases: same register assignment as the U aliases
148 #define dYr0 D16.F32
149 #define dYi0 D17.F32
150 #define dYr2 D18.F32
151 #define dYi2 D19.F32
152 #define dYr4 D20.F32
153 #define dYi4 D21.F32
154 #define dYr6 D22.F32
155 #define dYi6 D23.F32
156 #define dYr1 D24.F32
157 #define dYi1 D25.F32
158 #define dYr3 D26.F32
159 #define dYi3 D27.F32
160 #define dYr5 D28.F32
161 #define dYi5 D29.F32
162 #define dYr7 D30.F32
163 #define dYi7 D31.F32
164 #define qY0 Q8.F32
165 #define qY1 Q12.F32
166 #define qY2 Q9.F32
167 #define qY3 Q13.F32
168 #define qY4 Q10.F32
169 #define qY5 Q14.F32
170 #define qY6 Q11.F32
171 #define qY7 Q15.F32
172
@// Scratch pair; these are the same registers as dXr7/dXi7 (qX7)
173 #define dT0 D14.F32
174 #define dT1 D15.F32
175
176 @// Define constants
177 @ sqrt(1/2) as a single-precision float; FFTSTAGE takes its address into t0 and loads it into dT0[0]
178 ONEBYSQRT2: .float 0.7071067811865476e0
179
180
181 .MACRO FFTSTAGE scaled, inverse, name
@// FFTSTAGE expands to the body of a first-stage radix-8 pass that reads the
@// data at pSrc and writes the results to pDst.
@//   scaled  : not referenced anywhere in this macro body
@//   inverse : "TRUE" selects the inverse-transform arithmetic in the .ifeqs blocks;
@//             any other string selects the forward arithmetic (.ELSE branches)
@//   name    : suffix appended to the local labels of this expansion
@// On exit: subFFTSize = 8, subFFTNum = (entry subFFTNum)/8,
@//          pSrc = (final pDst) - pointStep, pDst = pPingPongBuf.
@// Registers written: pSrc, pDst, r3, r4, r6-r11 and D0-D31.
@// NOTE(review): nothing in this macro saves or restores them; presumably that is
@// left to M_START or to the caller of these "unsafe" routines - confirm.
182
183 @// Define stack arguments (this macro does not access the stack)
184
185 @// Update subFFTSize and subFFTNum for the following stage
186 @// subFFTSize becomes 8 (the original note says it is 1 on entry to the first stage)
187 MOV subFFTSize,#8
@// t0 = address of the sqrt(1/2) constant, dereferenced later by VLD1
188 LDR t0,=ONEBYSQRT2
189
190 @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
191 LSR grpSize,subFFTNum,#3
192 MOV subFFTNum,grpSize
193
194
195 @// The original note gives 8 bytes per complex point, so
196 @// pointStep = 8*grpSize is the byte stride from data[k] to data[k+1]
197 @// Note: outPointStep = pointStep for first stage (both are #defined to the same register)
198
199 MOV pointStep,grpSize,LSL #3
200
201
202 @// Calculate the steps used to walk the input and output buffers.
@// The loads of data[0..7] for the first loop iteration are interleaved with this setup.
203 @//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
204 VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
@// step1 = 16*grpSize = 2*pointStep
205 MOV step1,grpSize,LSL #4
206
207 MOV step2,pointStep,LSL #3
208 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
209 SUB step2,step2,pointStep @// step2 = 7*pointStep
210 @// setStep = - 7*pointStep+16
211 RSB setStep,step2,#16
212
213 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
214 VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
215 VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
216 VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
217 VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
218 @// data[7] & update pSrc for the next set
219 @// setStep = -7*pointStep + 16: back to data[0], then forward past the 16 bytes just consumed
220 VLD2 {dXr7,dXi7},[pSrc :128],setStep
221 @// grp = 0 a special case since all the twiddle factors are 1
222 @// Loop on the sets
223
224 radix8fsGrpZeroSetLoop\name :
225
226 @// Decrement setCount by 2: every D register carries two points, so one pass handles two sets.
@// The flags from this SUBS feed the BEQ and BGT below; the instructions in between
@// are NEON operations or non-flag-setting SUB/ADD.
227 SUBS setCount,setCount,#2
228
229
230 @// first stage of 8 point FFT, sums: U0=X0+X4, U2=X1+X5, U4=X2+X6, U6=X3+X7
231
232 VADD qU0,qX0,qX4
233 VADD qU2,qX1,qX5
234 VADD qU4,qX2,qX6
235 VADD qU6,qX3,qX7
236
237 @// second stage of 8 point FFT, even half: V0=U0+U4, V2=U0-U4, V4=U2+U6, V6=U2-U6
238
239 VADD qV0,qU0,qU4
240 VSUB qV2,qU0,qU4
241 VADD qV4,qU2,qU6
242 VSUB qV6,qU2,qU6
243
244 @// third stage of 8 point FFT, even outputs: Y0=V0+V4, Y4=V0-V4, y2/y6 = V2 +/- j*V6
245
246 VADD qY0,qV0,qV4
247 VSUB qY4,qV0,qV4
248 VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
249
250 .ifeqs "\inverse", "TRUE"
251
@// inverse: y2 = V2 + j*V6, y6 = V2 - j*V6
@// (the U1/U3/U5 differences of the first stage are interleaved with the stores)
252 VSUB dYr2,dVr2,dVi6
253 VADD dYi2,dVi2,dVr6
254
255 VADD dYr6,dVr2,dVi6
256 VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
257 VSUB dYi6,dVi2,dVr6
258
259 VSUB qU1,qX0,qX4
260 VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
261
262 VSUB qU3,qX1,qX5
263 VSUB qU5,qX2,qX6
264 VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
265
266 .ELSE
267
@// forward: y2 = V2 - j*V6 is built in the dY6 registers and y6 = V2 + j*V6 in the
@// dY2 registers; the "store" comments name the output slot being written
268 VADD dYr6,dVr2,dVi6
269 VSUB dYi6,dVi2,dVr6
270
271 VSUB dYr2,dVr2,dVi6
272 VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2 (held in dYr6/dYi6)
273 VADD dYi2,dVi2,dVr6
274
275
276 VSUB qU1,qX0,qX4
277 VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
278 VSUB qU3,qX1,qX5
279 VSUB qU5,qX2,qX6
280 VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6 (held in dYr2/dYi2)
281
282
283 .ENDIF
284
285 @// first stage of 8 point FFT, last difference: U7 = X3 - X7
@// This is the final read of qX7; D14/D15 are then reused as dT0/dT1,
@// starting with the load of sqrt(1/2) into dT0[0].
286
287 VSUB qU7,qX3,qX7
288 VLD1 dT0[0], [t0]
289
290 @// second stage of 8 point FFT, odd half: V1=U1+j*U5, V3=U1-j*U5, V5=U3+j*U7, V7=U3-j*U7
@// (interleaved with the loads of data[0..3] for the next iteration)
291
292 VSUB dVr1,dUr1,dUi5
293 @// data[0] for next iteration
294 VLD2 {dXr0,dXi0},[pSrc :128],pointStep
295 VADD dVi1,dUi1,dUr5
296 VADD dVr3,dUr1,dUi5
297 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
298 VSUB dVi3,dUi1,dUr5
299
300 VSUB dVr5,dUr3,dUi7
301 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
302 VADD dVi5,dUi3,dUr7
303 VADD dVr7,dUr3,dUi7
304 VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
305 VSUB dVi7,dUi3,dUr7
306
307 @// third stage of 8 point FFT, odd outputs, using a*V5 and b*V7 where
@// a = (1+j)*sqrt(1/2) and b = (1-j)*sqrt(1/2); sqrt(1/2) is in dT0[0]
308
309 .ifeqs "\inverse", "TRUE"
310
@// inverse: y1 = V1 + a*V5, y5 = V1 - a*V5, y3 = V3 - b*V7, y7 = V3 + b*V7
311 @// calculate a*v5
312 VMUL dT1,dVr5,dT0[0] @// dT1 (D15) is scratch for the scaled real part
313
314 VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
315 VMUL dVi5,dVi5,dT0[0]
316
317 VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
318 VSUB dVr5,dT1,dVi5 @// a * V5
319 VADD dVi5,dT1,dVi5
320
321 VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
322
323 @// calculate b*v7
324 VMUL dT1,dVr7,dT0[0]
325 VMUL dVi7,dVi7,dT0[0]
326
327 VADD qY1,qV1,qV5
328 VSUB qY5,qV1,qV5
329
330
331 VADD dVr7,dT1,dVi7 @// b * V7
332 VSUB dVi7,dVi7,dT1
333 SUB pDst, pDst, step2 @// set pDst to y1
334
335 @// On the last iteration, this will read past the end of pSrc,
336 @// so skip this read.
@// (Only data[7] is skipped; the data[0..6] loads above always execute.
@// dT0/dT1 are no longer needed at this point, so D14/D15 can be overwritten.)
337 BEQ radix8SkipLastUpdateInv\name
338 VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
339 radix8SkipLastUpdateInv\name:
340
341 VSUB dYr3,dVr3,dVr7
342 VSUB dYi3,dVi3,dVi7
343 VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
344 VADD dYr7,dVr3,dVr7
345 VADD dYi7,dVi3,dVi7
346
347
348 VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
349 VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
350 VST2 {dYr7,dYi7},[pDst :128] @// store y7
351 ADD pDst, pDst, #16
352
353 .ELSE
354
@// forward: slot y1 = V3 + b*V7 (dY7 regs), y3 = V1 - a*V5 (dY5 regs),
@//          slot y5 = V3 - b*V7 (dY3 regs), y7 = V1 + a*V5 (dY1 regs)
355 @// calculate b*v7
356 VMUL dT1,dVr7,dT0[0]
357 VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
358 VMUL dVi7,dVi7,dT0[0]
359
360 VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
361 VADD dVr7,dT1,dVi7 @// b * V7
362 VSUB dVi7,dVi7,dT1
363
364 VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
365
366 @// calculate a*v5
367 VMUL dT1,dVr5,dT0[0] @// dT1 (D15) is scratch for the scaled real part
368 VMUL dVi5,dVi5,dT0[0]
369
370 VADD dYr7,dVr3,dVr7
371 VADD dYi7,dVi3,dVi7
372 SUB pDst, pDst, step2 @// set pDst to y1
373
374 VSUB dVr5,dT1,dVi5 @// a * V5
375 VADD dVi5,dT1,dVi5
376
377 @// On the last iteration, this will read past the end of pSrc,
378 @// so skip this read.
@// (Only data[7] is skipped; the data[0..6] loads above always execute.
@// dT0/dT1 are no longer needed at this point, so D14/D15 can be overwritten.)
379 BEQ radix8SkipLastUpdateFwd\name
380 VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
381 radix8SkipLastUpdateFwd\name:
382
383 VSUB qY5,qV1,qV5
384
385 VSUB dYr3,dVr3,dVr7
386 VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1 (held in dYr7/dYi7)
387 VSUB dYi3,dVi3,dVi7
388 VADD qY1,qV1,qV5
389
390
391 VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3 (held in dYr5/dYi5)
392 VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5 (held in dYr3/dYi3)
393 VST2 {dYr1,dYi1},[pDst :128]! @// store y7 (held in dYr1/dYi1); writeback advances pDst by 16
394
395 .ENDIF
396
397
398 @// update pDst for the next set: back by 7*pointStep from the y7 slot (pDst is already 16 bytes past it)
399 SUB pDst, pDst, step2
400 BGT radix8fsGrpZeroSetLoop\name
401
402
403 @// reset pSrc to pDst for the next stage
404 SUB pSrc,pDst,pointStep @// pSrc = pDst - pointStep (pointStep = 8*grpSize)
405 MOV pDst,pPingPongBuf
406
407
408
409 .endm
410
411
412 @// Forward first-stage radix-8 entry point: expands FFTSTAGE with
@// scaled="FALSE", inverse="FALSE" and label suffix FWD.
@// M_START/M_END are not defined in this file (presumably they come from
@// dl/api/armCOMM_s.h and emit the function prologue/epilogue, with ",r4"
@// naming the registers to save - confirm against that header).
413
414
415 M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
416 FFTSTAGE "FALSE","FALSE",FWD
417 M_END
418
419
@// Inverse first-stage radix-8 entry point: expands FFTSTAGE with
@// scaled="FALSE", inverse="TRUE" and label suffix INV.
@// M_START/M_END are not defined in this file (presumably provided by
@// dl/api/armCOMM_s.h - confirm their prologue/epilogue behaviour there).
420 M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
421 FFTSTAGE "FALSE","TRUE",INV
422 M_END
423
424
425
426 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698