OLD | NEW |
---|---|
(Empty) | |
1 @// | |
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | |
3 @// | |
4 @// Use of this source code is governed by a BSD-style license | |
5 @// that can be found in the LICENSE file in the root of the source | |
6 @// tree. An additional intellectual property rights grant can be found | |
7 @// in the file PATENTS. All contributing project authors may | |
8 @// be found in the AUTHORS file in the root of the source tree. | |
9 @// | |
10 @// This file was originally licensed as follows. It has been | |
11 @// relicensed with permission from the copyright holders. | |
12 | |
13 @// | |
14 @// | |
15 @// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s | |
16 @// OpenMAX DL: v1.0.2 | |
17 @// Last Modified Revision: 7761 | |
18 @// Last Modified Date: Wed, 26 Sep 2007 | |
19 @// | |
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. | |
21 @// | |
22 @// | |
23 @// | |
24 @// Description: | |
25 @// Compute a Radix 4 FFT stage for a N point complex signal | |
26 @// | |
27 @// | |
28 | |
29 | |
30 @// Include standard headers | |
31 | |
32 #include "dl/api/armCOMM_s.h" | |
33 #include "dl/api/omxtypes_s.h" | |
34 | |
35 | |
36 | |
37 @// Import symbols required from other files | |
38 @// (For example tables) | |
39 | |
40 | |
41 | |
42 | |
43 @// Set debugging level | |
44 @//DEBUG_ON SETL {TRUE} | |
45 | |
46 | |
47 @// Guarding implementation by the processor name | |
48 | |
49 | |
50 | |
51 @// Guarding implementation by the processor name | |
52 | |
53 | |
54 @// Import symbols required from other files | |
55 @// (For example tables) | |
56 | |
57 | |
58 @// Input registers: core-register aliases for the values FFTSTAGE reads on entry | |
59 | |
60 #define pSrc r0 | |
61 #define pDst r2 | |
62 #define pTwiddle r1 | |
63 #define subFFTNum r6 | |
64 #define subFFTSize r7 | |
65 | |
66 | |
67 | |
68 @// Output registers: no aliases are defined in this section | |
69 | |
70 | |
71 @// Local scratch registers; note that t1 and grpCount are both r3 | |
72 | |
73 #define grpCount r3 | |
74 #define pointStep r4 | |
75 #define outPointStep r5 | |
76 #define stepTwiddle r12 | |
77 #define setCount r14 | |
78 #define srcStep r8 | |
79 #define setStep r9 | |
80 #define dstStep r10 | |
81 #define twStep r11 | |
82 #define t1 r3 | |
83 | |
84 @// NEON registers: every Q alias below overlays a D pair, e.g. qX0 (Q2) = dXr0:dXi0 (D4:D5); qT0/qT1 (Q8/Q9) share storage with qY2/qY3 and qT2/qT3 (Q6/Q7) with qY0/qY1 | |
85 | |
86 #define dW1 D0.S16 | |
87 #define dW2 D1.S16 | |
88 #define dW3 D2.S16 | |
89 | |
90 #define dXr0 D4.S16 | |
91 #define dXi0 D5.S16 | |
92 #define dXr1 D6.S16 | |
93 #define dXi1 D7.S16 | |
94 #define dXr2 D8.S16 | |
95 #define dXi2 D9.S16 | |
96 #define dXr3 D10.S16 | |
97 #define dXi3 D11.S16 | |
98 #define dYr0 D12.S16 | |
99 #define dYi0 D13.S16 | |
100 #define dYr1 D14.S16 | |
101 #define dYi1 D15.S16 | |
102 #define dYr2 D16.S16 | |
103 #define dYi2 D17.S16 | |
104 #define dYr3 D18.S16 | |
105 #define dYi3 D19.S16 | |
106 #define qT0 Q8.S32 | |
107 #define qT1 Q9.S32 | |
108 #define qT2 Q6.S32 | |
109 #define qT3 Q7.S32 | |
110 | |
111 #define dZr0 D20.S16 | |
112 #define dZi0 D21.S16 | |
113 #define dZr1 D22.S16 | |
114 #define dZi1 D23.S16 | |
115 #define dZr2 D24.S16 | |
116 #define dZi2 D25.S16 | |
117 #define dZr3 D26.S16 | |
118 #define dZi3 D27.S16 | |
119 #define qY0 Q6.S16 | |
120 #define qY1 Q7.S16 | |
121 #define qY2 Q8.S16 | |
122 #define qY3 Q9.S16 | |
123 #define qX0 Q2.S16 | |
124 #define qZ0 Q10.S16 | |
125 #define qZ1 Q11.S16 | |
126 #define qZ2 Q12.S16 | |
127 #define qZ3 Q13.S16 | |
128 | |
129 | |
130 .MACRO FFTSTAGE scaled, inverse , name | |
131 | |
132 @// FFTSTAGE emits one radix-4 pass over SC16 data: \scaled selects halving (VHADD/VHSUB) or plain (VADD/VSUB) butterflies, \inverse selects the VMLAL/VMLSL twiddle-multiply variant, \name keeps the loop labels unique; no stack arguments are defined | |
133 | |
134 | |
135 @// Update the counters right away so their registers can be reused: grpCount = 4*subFFTSize (copied back to subFFTSize), subFFTNum = subFFTNum/4 | |
136 | |
137 LSL grpCount,subFFTSize,#2 | |
138 LSR subFFTNum,subFFTNum,#2 | |
139 MOV subFFTSize,grpCount | |
140 | |
141 | |
142 @// pOut0+1 increments pOut0 by 4 bytes | |
143 @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes | |
144 | |
145 MOV stepTwiddle,#0 | |
146 SMULBB outPointStep,grpCount,subFFTNum | |
147 | |
148 @// pT0+1 increments pT0 by 4 bytes | |
149 @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes | |
150 | |
151 LSL pointStep,subFFTNum,#2 @// pointStep = 4*subFFTNum (original note: 2*grpSize) | |
152 | |
153 VLD1 dW1,[pTwiddle :64] @//[wi | wr] | |
154 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep | |
155 VLD1 dW2,[pTwiddle :64] @//[wi | wr] | |
156 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep | |
157 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16 | |
158 VLD1 dW3,[pTwiddle :64] | |
159 @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16 | |
160 RSB setStep,setStep,#0 @// setStep = - 3*pointStep | |
161 | |
162 MOV dstStep,outPointStep,LSL #1 | |
163 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep | |
164 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16 | |
165 | |
166 | |
167 | |
168 grpLoop\name: | |
169 | |
170 VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] | |
171 ADD stepTwiddle,stepTwiddle,pointStep | |
172 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] | |
173 ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point | |
174 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] | |
175 MOV twStep,stepTwiddle,LSL #2 | |
176 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc | |
177 | |
178 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle | |
179 | |
180 | |
181 MOV setCount,pointStep,LSR #2 | |
182 ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set | |
183 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set | |
184 | |
185 @// Loop on the sets : 4 at a time | |
186 | |
187 setLoop\name: | |
188 | |
189 SUBS setCount,setCount,#4 @// decrement the loop counter; nothing below alters the flags, so the BGT at the loop end tests this result | |
190 | |
191 .ifeqs "\inverse", "TRUE" | |
192 VMULL qT0,dXr1,dW1[0] | |
193 VMLAL qT0,dXi1,dW1[1] @// real part | |
194 VMULL qT1,dXi1,dW1[0] | |
195 VMLSL qT1,dXr1,dW1[1] @// imag part | |
196 | |
197 .ELSE | |
198 VMULL qT0,dXr1,dW1[0] | |
199 VMLSL qT0,dXi1,dW1[1] @// real part | |
200 VMULL qT1,dXi1,dW1[0] | |
201 VMLAL qT1,dXr1,dW1[1] @// imag part | |
202 | |
203 .ENDIF | |
204 | |
205 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] of the next set, loaded once the current data[1] has been consumed | |
206 | |
207 .ifeqs "\inverse", "TRUE" | |
208 VMULL qT2,dXr2,dW2[0] | |
209 VMLAL qT2,dXi2,dW2[1] @// real part | |
210 VMULL qT3,dXi2,dW2[0] | |
211 VMLSL qT3,dXr2,dW2[1] @// imag part | |
212 | |
213 .ELSE | |
214 VMULL qT2,dXr2,dW2[0] | |
215 VMLSL qT2,dXi2,dW2[1] @// real part | |
216 VMULL qT3,dXi2,dW2[0] | |
217 VMLAL qT3,dXr2,dW2[1] @// imag part | |
218 | |
219 .ENDIF | |
220 | |
221 VRSHRN dZr1,qT0,#15 | |
222 VRSHRN dZi1,qT1,#15 | |
223 | |
224 | |
225 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] of the next set | |
226 | |
227 .ifeqs "\inverse", "TRUE" | |
228 VMULL qT0,dXr3,dW3[0] | |
229 VMLAL qT0,dXi3,dW3[1] @// real part | |
230 VMULL qT1,dXi3,dW3[0] | |
231 VMLSL qT1,dXr3,dW3[1] @// imag part | |
232 | |
233 .ELSE | |
234 VMULL qT0,dXr3,dW3[0] | |
235 VMLSL qT0,dXi3,dW3[1] @// real part | |
236 VMULL qT1,dXi3,dW3[0] | |
237 VMLAL qT1,dXr3,dW3[1] @// imag part | |
238 | |
239 .ENDIF | |
240 | |
241 VRSHRN dZr2,qT2,#15 | |
242 VRSHRN dZi2,qT3,#15 | |
243 | |
244 | |
245 VRSHRN dZr3,qT0,#15 | |
246 VRSHRN dZi3,qT1,#15 | |
247 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] of the next set & rewind pSrc by 3*pointStep; this load-ahead also runs on the final pass (the reviewer remark below reports an out-of-bounds read) | |
aedla
2013/06/26 20:24:46
16 byte OOB read when both loops are at their last
| |
248 | |
249 | |
250 .ifeqs "\scaled", "TRUE" | |
251 | |
252 @// finish first stage of 4 point FFT: qY0 = qX0+qZ2, qY2 = qX0-qZ2, qY1 = qZ1+qZ3, qY3 = qZ1-qZ3 (halving forms) | |
253 VHADD qY0,qX0,qZ2 | |
254 VHSUB qY2,qX0,qZ2 | |
255 | |
256 VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] of the next set, loaded once qX0 has been consumed | |
257 VHADD qY1,qZ1,qZ3 | |
258 VHSUB qY3,qZ1,qZ3 | |
259 | |
260 | |
261 @// finish second stage of 4 point FFT | |
262 | |
263 .ifeqs "\inverse", "TRUE" | |
264 | |
265 VHSUB qZ0,qY2,qY1 | |
266 | |
267 VHADD dZr2,dYr0,dYi3 | |
268 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
269 VHSUB dZi2,dYi0,dYr3 | |
270 | |
271 VHADD qZ1,qY2,qY1 | |
272 VST2 {dZr2,dZi2},[pDst :128],outPointStep | |
273 | |
274 VHSUB dZr3,dYr0,dYi3 | |
275 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
276 VHADD dZi3,dYi0,dYr3 | |
277 VST2 {dZr3,dZi3},[pDst :128],dstStep | |
278 | |
279 | |
280 .ELSE | |
281 | |
282 VHSUB qZ0,qY2,qY1 | |
283 | |
284 VHSUB dZr3,dYr0,dYi3 | |
285 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
286 VHADD dZi3,dYi0,dYr3 | |
287 | |
288 VHADD qZ1,qY2,qY1 | |
289 VST2 {dZr3,dZi3},[pDst :128],outPointStep | |
290 | |
291 VHADD dZr2,dYr0,dYi3 | |
292 VHSUB dZi2,dYi0,dYr3 | |
293 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
294 VST2 {dZr2,dZi2},[pDst :128],dstStep | |
295 | |
296 | |
297 .ENDIF | |
298 | |
299 | |
300 .ELSE | |
301 | |
302 @// finish first stage of 4 point FFT: qY0 = qX0+qZ2, qY2 = qX0-qZ2, qY1 = qZ1+qZ3, qY3 = qZ1-qZ3 (non-halving forms) | |
303 VADD qY0,qX0,qZ2 | |
304 VSUB qY2,qX0,qZ2 | |
305 | |
306 VLD2 {dXr0,dXi0},[pSrc]! @// data[0] of the next set (no :128 alignment hint here, unlike the scaled path) | |
307 VADD qY1,qZ1,qZ3 | |
308 VSUB qY3,qZ1,qZ3 | |
309 | |
310 | |
311 @// finish second stage of 4 point FFT | |
312 | |
313 | |
314 .ifeqs "\inverse", "TRUE" | |
315 | |
316 VSUB qZ0,qY2,qY1 | |
317 | |
318 VADD dZr2,dYr0,dYi3 | |
319 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
320 VSUB dZi2,dYi0,dYr3 | |
321 | |
322 VADD qZ1,qY2,qY1 | |
323 VST2 {dZr2,dZi2},[pDst :128],outPointStep | |
324 | |
325 VSUB dZr3,dYr0,dYi3 | |
326 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
327 VADD dZi3,dYi0,dYr3 | |
328 VST2 {dZr3,dZi3},[pDst :128],dstStep | |
329 | |
330 | |
331 .ELSE | |
332 | |
333 VSUB qZ0,qY2,qY1 | |
334 | |
335 VSUB dZr3,dYr0,dYi3 | |
336 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
337 VADD dZi3,dYi0,dYr3 | |
338 | |
339 VADD qZ1,qY2,qY1 | |
340 VST2 {dZr3,dZi3},[pDst :128],outPointStep | |
341 | |
342 VADD dZr2,dYr0,dYi3 | |
343 VSUB dZi2,dYi0,dYr3 | |
344 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
345 VST2 {dZr2,dZi2},[pDst :128],dstStep | |
346 | |
347 | |
348 .ENDIF | |
349 | |
350 | |
351 | |
352 .ENDIF | |
353 | |
354 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set | |
355 BGT setLoop\name | |
356 | |
357 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr] | |
358 SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4 | |
359 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr] | |
360 ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp | |
361 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr] | |
aedla
2013/06/26 20:24:46
8 byte OOB read at the last iteration, coming from
| |
362 | |
363 | |
364 | |
365 BGT grpLoop\name | |
366 | |
367 | |
368 @// Reset and Swap pSrc and pDst for the next stage | |
369 MOV t1,pDst | |
370 SUB pDst,pSrc,outPointStep,LSL #2 @// pDst = pSrc - 4*outPointStep (original note: pDst -= size; pSrc -= 4*size bytes) | |
371 SUB pSrc,t1,outPointStep | |
372 | |
373 | |
374 .endm | |
375 | |
376 | |
@// Four entry points, each expanding FFTSTAGE once with a different
@// (scaled, inverse) pair and a unique label suffix.
@// M_START / M_END are not defined in this file (presumably they come from the
@// included armCOMM_s.h); the meaning of the trailing r4 argument should be
@// confirmed there.
@// NOTE(review): the FFTSTAGE body writes r3-r5, r8-r12, r14 and D0-D27 through
@// its register aliases; confirm that M_START or the callers of these _unsafe
@// routines preserve whatever the ABI requires (e.g. D8-D15 under AAPCS).
@// Forward transform, unscaled butterflies (scaled=FALSE, inverse=FALSE).
377 M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4 | |
378 FFTSTAGE "FALSE","FALSE",FWD | |
379 M_END | |
380 | |
381 | |
@// Inverse transform, unscaled butterflies (scaled=FALSE, inverse=TRUE).
382 M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4 | |
383 FFTSTAGE "FALSE","TRUE",INV | |
384 M_END | |
385 | |
386 | |
@// Forward transform, halving butterflies (scaled=TRUE, inverse=FALSE).
387 M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4 | |
388 FFTSTAGE "TRUE","FALSE",FWDSFS | |
389 M_END | |
390 | |
391 | |
@// Inverse transform, halving butterflies (scaled=TRUE, inverse=TRUE).
392 M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4 | |
393 FFTSTAGE "TRUE","TRUE",INVSFS | |
394 M_END | |
395 | |
396 | |
397 | |
398 | |
399 | |
400 .END | |
OLD | NEW |