Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(334)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This file was originally licensed as follows. It has been
11 @// relicensed with permission from the copyright holders.
12
13 @//
14 @//
15 @// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
16 @// OpenMAX DL: v1.0.2
17 @// Last Modified Revision: 7761
18 @// Last Modified Date: Wed, 26 Sep 2007
19 @//
20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21 @//
22 @//
23 @//
24 @// Description:
25 @// Compute a Radix 4 FFT stage for a N point complex signal
26 @//
27 @//
28
29
30 @// Include standard headers
31
32 #include "dl/api/armCOMM_s.h"
33 #include "dl/api/omxtypes_s.h"
34
35
36
37 @// Import symbols required from other files
38 @// (For example tables)
39
40
41
42
43 @// Set debugging level
44 @//DEBUG_ON SETL {TRUE}
45
46
47 @// Guarding implementation by the processor name
48
49
50
51 @// Guarding implementation by the processor name
52
53
54 @// Import symbols required from other files
55 @// (For example tables)
56
57
58 @// Input registers
59 @// FFTSTAGE reads these five registers on entry and also writes new values to all five of them.
60 #define pSrc r0
61 #define pDst r2
62 #define pTwiddle r1
63 #define subFFTNum r6
64 #define subFFTSize r7
65
66
67
68 @//Output Registers
69
70
71 @// Local scratch registers
72 @// t1 shares r3 with grpCount; FFTSTAGE only uses t1 after the group loop has finished.
73 #define grpCount r3
74 #define pointStep r4
75 #define outPointStep r5
76 #define stepTwiddle r12
77 #define setCount r14
78 #define srcStep r8
79 #define setStep r9
80 #define dstStep r10
81 #define twStep r11
82 #define t1 r3
83
84 @// Neon registers
85 @// dW1, dW2 and dW3 receive the values loaded through pTwiddle; FFTSTAGE multiplies by lanes [0] and [1] of each.
86 #define dW1 D0.S16
87 #define dW2 D1.S16
88 #define dW3 D2.S16
89 @// Several names below overlay the same hardware register (Qn covers D(2n) and D(2n+1)): qX0 is dXr0/dXi0, qY0..qY3 are the dYr/dYi pairs, qZ0..qZ3 are the dZr/dZi pairs, and the S32 views qT0, qT1, qT2, qT3 share Q8, Q9, Q6, Q7 with qY2, qY3, qY0, qY1.
90 #define dXr0 D4.S16
91 #define dXi0 D5.S16
92 #define dXr1 D6.S16
93 #define dXi1 D7.S16
94 #define dXr2 D8.S16
95 #define dXi2 D9.S16
96 #define dXr3 D10.S16
97 #define dXi3 D11.S16
98 #define dYr0 D12.S16
99 #define dYi0 D13.S16
100 #define dYr1 D14.S16
101 #define dYi1 D15.S16
102 #define dYr2 D16.S16
103 #define dYi2 D17.S16
104 #define dYr3 D18.S16
105 #define dYi3 D19.S16
106 #define qT0 Q8.S32
107 #define qT1 Q9.S32
108 #define qT2 Q6.S32
109 #define qT3 Q7.S32
110
111 #define dZr0 D20.S16
112 #define dZi0 D21.S16
113 #define dZr1 D22.S16
114 #define dZi1 D23.S16
115 #define dZr2 D24.S16
116 #define dZi2 D25.S16
117 #define dZr3 D26.S16
118 #define dZi3 D27.S16
119 #define qY0 Q6.S16
120 #define qY1 Q7.S16
121 #define qY2 Q8.S16
122 #define qY3 Q9.S16
123 #define qX0 Q2.S16
124 #define qZ0 Q10.S16
125 #define qZ1 Q11.S16
126 #define qZ2 Q12.S16
127 #define qZ3 Q13.S16
128
129
130 .MACRO FFTSTAGE scaled, inverse , name
131 @// Emits one radix-4 stage. scaled and inverse are compared against the string TRUE to pick the code paths below.
132 @// Define stack arguments
133 @// name is appended to the grpLoop and setLoop labels so that every expansion of the macro gets its own labels.
134
135 @// Update grpCount and grpSize right away in order to reuse the pGrpCount and pGrpSize regs
136
137 LSL grpCount,subFFTSize,#2 @// grpCount = 4*subFFTSize
138 LSR subFFTNum,subFFTNum,#2 @// subFFTNum = subFFTNum/4
139 MOV subFFTSize,grpCount @// subFFTSize = 4*(its value on entry)
140
141
142 @// pOut0+1 increments pOut0 by 4 bytes
143 @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
144
145 MOV stepTwiddle,#0
146 SMULBB outPointStep,grpCount,subFFTNum @// signed 16x16 multiply of the low halfwords of grpCount and subFFTNum
147
148 @// pT0+1 increments pT0 by 4 bytes
149 @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
150
151 LSL pointStep,subFFTNum,#2 @// pointStep = 4*subFFTNum (original note: 2*grpSize)
152 @// The three VLD1 below have no writeback, so dW1, dW2 and dW3 are all loaded from the same address.
153 VLD1 dW1,[pTwiddle :64] @//[wi | wr]
154 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
155 VLD1 dW2,[pTwiddle :64] @//[wi | wr]
156 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
157 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
158 VLD1 dW3,[pTwiddle :64]
159 @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
160 RSB setStep,setStep,#0 @// setStep = - 3*pointStep
161
162 MOV dstStep,outPointStep,LSL #1
163 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
164 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
165
166
167 @// Group loop: repeats while grpCount, reduced by 4 at the bottom of each pass, is still greater than zero.
168 grpLoop\name:
169 @// Preload the four inputs of the first set; the scalar updates of the twiddle steps are interleaved with the loads.
170 VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
171 ADD stepTwiddle,stepTwiddle,pointStep
172 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
173 ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
174 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
175 MOV twStep,stepTwiddle,LSL #2
176 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc
177
178 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
179
180
181 MOV setCount,pointStep,LSR #2 @// setCount = pointStep/4
182 ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set
183 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
184
185 @// Loop on the sets : 4 at a time
186
187 setLoop\name:
188 @// The flags set by this SUBS are consumed by the BGT at the bottom of the set loop.
189 SUBS setCount,setCount,#4 @// decrement the loop counter
190 @// data[1] times dW1: the inverse path computes Xr*W[0]+Xi*W[1] and Xi*W[0]-Xr*W[1], the other path flips both signs.
191 .ifeqs "\inverse", "TRUE"
192 VMULL qT0,dXr1,dW1[0]
193 VMLAL qT0,dXi1,dW1[1] @// real part
194 VMULL qT1,dXi1,dW1[0]
195 VMLSL qT1,dXr1,dW1[1] @// imag part
196
197 .ELSE
198 VMULL qT0,dXr1,dW1[0]
199 VMLSL qT0,dXi1,dW1[1] @// real part
200 VMULL qT1,dXi1,dW1[0]
201 VMLAL qT1,dXr1,dW1[1] @// imag part
202
203 .ENDIF
204 @// dXr1/dXi1 have been consumed above, so they are reloaded here with the values for the next pass.
205 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
206 @// data[2] times dW2, same sign pattern as above; the products go to qT2/qT3.
207 .ifeqs "\inverse", "TRUE"
208 VMULL qT2,dXr2,dW2[0]
209 VMLAL qT2,dXi2,dW2[1] @// real part
210 VMULL qT3,dXi2,dW2[0]
211 VMLSL qT3,dXr2,dW2[1] @// imag part
212
213 .ELSE
214 VMULL qT2,dXr2,dW2[0]
215 VMLSL qT2,dXi2,dW2[1] @// real part
216 VMULL qT3,dXi2,dW2[0]
217 VMLAL qT3,dXr2,dW2[1] @// imag part
218
219 .ENDIF
220 @// Narrow the 32-bit data[1] products to 16 bits with a rounding right shift by 15; this frees qT0/qT1 for data[3].
221 VRSHRN dZr1,qT0,#15
222 VRSHRN dZi1,qT1,#15
223
224
225 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
226 @// data[3] times dW3, same sign pattern as above; qT0/qT1 are reused for the products.
227 .ifeqs "\inverse", "TRUE"
228 VMULL qT0,dXr3,dW3[0]
229 VMLAL qT0,dXi3,dW3[1] @// real part
230 VMULL qT1,dXi3,dW3[0]
231 VMLSL qT1,dXr3,dW3[1] @// imag part
232
233 .ELSE
234 VMULL qT0,dXr3,dW3[0]
235 VMLSL qT0,dXi3,dW3[1] @// real part
236 VMULL qT1,dXi3,dW3[0]
237 VMLAL qT1,dXr3,dW3[1] @// imag part
238
239 .ENDIF
240 @// Narrow the data[2] and data[3] products; after this qT0..qT3 are free to be overwritten as qY0..qY3.
241 VRSHRN dZr2,qT2,#15
242 VRSHRN dZi2,qT3,#15
243
244
245 VRSHRN dZr3,qT0,#15
246 VRSHRN dZi3,qT1,#15
247 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
aedla 2013/06/26 20:24:46 16 byte OOB read when both loops are at their last
248
249 @// The scaled path uses halving add/subtract (VHADD/VHSUB); the other path uses plain VADD/VSUB.
250 .ifeqs "\scaled", "TRUE"
251
252 @// finish first stage of 4 point FFT
253 VHADD qY0,qX0,qZ2
254 VHSUB qY2,qX0,qZ2
255 @// qX0 has been consumed by the two instructions above, so data[0] can be reloaded for the next pass.
256 VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]
257 VHADD qY1,qZ1,qZ3
258 VHSUB qY3,qZ1,qZ3
259
260
261 @// finish second stage of 4 point FFT
262 @// Inverse path stores dZ0, dZ2, dZ1, dZ3 in that order; the other path stores dZ0, dZ3, dZ1, dZ2.
263 .ifeqs "\inverse", "TRUE"
264
265 VHSUB qZ0,qY2,qY1
266
267 VHADD dZr2,dYr0,dYi3
268 VST2 {dZr0,dZi0},[pDst :128],outPointStep
269 VHSUB dZi2,dYi0,dYr3
270
271 VHADD qZ1,qY2,qY1
272 VST2 {dZr2,dZi2},[pDst :128],outPointStep
273
274 VHSUB dZr3,dYr0,dYi3
275 VST2 {dZr1,dZi1},[pDst :128],outPointStep
276 VHADD dZi3,dYi0,dYr3
277 VST2 {dZr3,dZi3},[pDst :128],dstStep
278
279
280 .ELSE
281
282 VHSUB qZ0,qY2,qY1
283
284 VHSUB dZr3,dYr0,dYi3
285 VST2 {dZr0,dZi0},[pDst :128],outPointStep
286 VHADD dZi3,dYi0,dYr3
287
288 VHADD qZ1,qY2,qY1
289 VST2 {dZr3,dZi3},[pDst :128],outPointStep
290
291 VHADD dZr2,dYr0,dYi3
292 VHSUB dZi2,dYi0,dYr3
293 VST2 {dZr1,dZi1},[pDst :128],outPointStep
294 VST2 {dZr2,dZi2},[pDst :128],dstStep
295
296
297 .ENDIF
298
299
300 .ELSE
301
302 @// finish first stage of 4 point FFT
303 VADD qY0,qX0,qZ2
304 VSUB qY2,qX0,qZ2
305 @// NOTE(review): unlike the scaled path, this data[0] reload carries no :128 alignment qualifier - confirm whether that is intentional.
306 VLD2 {dXr0,dXi0},[pSrc]! @// data[0]
307 VADD qY1,qZ1,qZ3
308 VSUB qY3,qZ1,qZ3
309
310
311 @// finish second stage of 4 point FFT
312
313 @// Same store order as in the scaled path: dZ0, dZ2, dZ1, dZ3 for inverse, otherwise dZ0, dZ3, dZ1, dZ2.
314 .ifeqs "\inverse", "TRUE"
315
316 VSUB qZ0,qY2,qY1
317
318 VADD dZr2,dYr0,dYi3
319 VST2 {dZr0,dZi0},[pDst :128],outPointStep
320 VSUB dZi2,dYi0,dYr3
321
322 VADD qZ1,qY2,qY1
323 VST2 {dZr2,dZi2},[pDst :128],outPointStep
324
325 VSUB dZr3,dYr0,dYi3
326 VST2 {dZr1,dZi1},[pDst :128],outPointStep
327 VADD dZi3,dYi0,dYr3
328 VST2 {dZr3,dZi3},[pDst :128],dstStep
329
330
331 .ELSE
332
333 VSUB qZ0,qY2,qY1
334
335 VSUB dZr3,dYr0,dYi3
336 VST2 {dZr0,dZi0},[pDst :128],outPointStep
337 VADD dZi3,dYi0,dYr3
338
339 VADD qZ1,qY2,qY1
340 VST2 {dZr3,dZi3},[pDst :128],outPointStep
341
342 VADD dZr2,dYr0,dYi3
343 VSUB dZi2,dYi0,dYr3
344 VST2 {dZr1,dZi1},[pDst :128],outPointStep
345 VST2 {dZr2,dZi2},[pDst :128],dstStep
346
347
348 .ENDIF
349
350
351
352 .ENDIF
353 @// ADD without the S suffix leaves the flags alone, so BGT still tests the result of the SUBS at the top of the set loop.
354 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
355 BGT setLoop\name
356 @// Load the twiddles for the next group; the post-increments add up to stepTwiddle + stepTwiddle + twStep = -stepTwiddle.
357 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
358 SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
359 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
360 ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
361 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
aedla 2013/06/26 20:24:46 8 byte OOB read at the last iteration, coming from
362
363
364
365 BGT grpLoop\name
366
367
368 @// Reset and Swap pSrc and pDst for the next stage
369 MOV t1,pDst @// t1 is r3, the same register as grpCount, which is no longer needed here
370 SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= size; pSrc -= 4*size bytes
371 SUB pSrc,t1,outPointStep
372
373
374 .endm
375
376 @// Expands FFTSTAGE with scaled=FALSE and inverse=FALSE (plain VADD/VSUB, non-inverse branches); loop labels get the suffix FWD. M_START/M_END are not defined in this file - presumably they come from the included headers.
377 M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
378 FFTSTAGE "FALSE","FALSE",FWD
379 M_END
380
381 @// Expands FFTSTAGE with scaled=FALSE and inverse=TRUE (plain VADD/VSUB, inverse branches); loop labels get the suffix INV.
382 M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
383 FFTSTAGE "FALSE","TRUE",INV
384 M_END
385
386 @// Expands FFTSTAGE with scaled=TRUE and inverse=FALSE (halving VHADD/VHSUB, non-inverse branches); loop labels get the suffix FWDSFS.
387 M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
388 FFTSTAGE "TRUE","FALSE",FWDSFS
389 M_END
390
391 @// Expands FFTSTAGE with scaled=TRUE and inverse=TRUE (halving VHADD/VHSUB, inverse branches); loop labels get the suffix INVSFS.
392 M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
393 FFTSTAGE "TRUE","TRUE",INVSFS
394 M_END
395
396
397
398
399
400 .END
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698