Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(722)

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 @//
2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 @//
4 @// Use of this source code is governed by a BSD-style license
5 @// that can be found in the LICENSE file in the root of the source
6 @// tree. An additional intellectual property rights grant can be found
7 @// in the file PATENTS. All contributing project authors may
8 @// be found in the AUTHORS file in the root of the source tree.
9 @//
10 @// This is a modification of
11 @// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
12 @// instead of SC32.
13 @//
14
15 @//
16 @// Description:
17 @// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
18 @// It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for k = 0,1,2,...,N/2-1.
19 @//
20 @//
21
22
23 @// Include standard headers
24
25 #include "dl/api/armCOMM_s.h"
26 #include "dl/api/omxtypes_s.h"
27
28
29 @// Import symbols required from other files
30 @// (For example tables)
31
32
33 @// Set debugging level
34 @//DEBUG_ON SETL {TRUE}
35
36
37
38 @// Guarding implementation by the processor name
39
40
41
42 @// Guarding implementation by the processor name
43
44
45
46 @// Input registers (r0-r3)
47
48 #define pSrc r0
49 #define pDst r1
50 #define pFFTSpec r2
51 #define scale r3
52 @// Of the four input aliases, only pSrc and pFFTSpec are referenced by the FFTSTAGE body in this file.
53
54 @// Output registers
55 #define result r0
56
57 @// Local scratch registers. Several aliases share one core register (for example r4 is argScale, tmpOrder, pTwiddle and x0r), so aliases of the same register cannot be live at the same time.
58
59 #define argTwiddle r1
60 #define argDst r2
61 #define argScale r4
62 #define tmpOrder r4
63 #define pTwiddle r4
64 #define pOut r5
65 #define subFFTSize r7
66 #define subFFTNum r6
67 #define N r6
68 #define order r14
69 #define diff r9
70 @// Total number of radix stages required to complete the FFT
71 #define count r8
72 #define x0r r4
73 #define x0i r5
74 #define diffMinusOne r2
75 #define round r3
76 @// Working pointers, counters and byte strides; pOut1 reuses r2 (also pFFTSpec), and step/step1 reuse r8/r9 (also count/diff).
77 #define pOut1 r2
78 #define size r7
79 #define step r8
80 #define step1 r9
81 #define twStep r10
82 #define pTwiddleTmp r11
83 #define argTwiddle1 r12
84 #define zero r14
85
86 @// NEON register aliases; every alias below names a D register with an .F32 type suffix
87 @// Many aliases share a register (for example D1 is dShift, dX1 and dX0i), so aliases of the same register cannot be live at the same time.
88 #define dX0 D0.F32
89 #define dShift D1.F32
90 #define dX1 D1.F32
91 #define dY0 D2.F32
92 #define dY1 D3.F32
93 #define dX0r D0.F32
94 #define dX0i D1.F32
95 #define dX1r D2.F32
96 #define dX1i D3.F32
97 #define dW0r D4.F32
98 #define dW0i D5.F32
99 #define dW1r D6.F32
100 #define dW1i D7.F32
101 #define dT0 D8.F32
102 #define dT1 D9.F32
103 #define dT2 D10.F32
104 #define dT3 D11.F32
105 #define qT0 D12.F32
106 #define qT1 D14.F32
107 #define qT2 D16.F32
108 #define qT3 D18.F32
109 #define dY0r D4.F32
110 #define dY0i D5.F32
111 #define dY1r D6.F32
112 #define dY1i D7.F32
113 @// Note: qT0..qT3 above are D-register aliases despite the q prefix; dY2/dY3/dW0/dW1 below reuse D4-D7.
114 #define dY2 D4.F32
115 #define dY3 D5.F32
116 #define dW0 D6.F32
117 #define dW1 D7.F32
118 #define dW0Tmp D10.F32
119 #define dW1Neg D11.F32
120 @// Holds the 0.5 scale factor; it is loaded by the VMOV at the top of FFTSTAGE.
121 #define half D13.F32
122
123 @// Byte offsets of the FFTSpec structure fields. FFTSTAGE loads the N, pTwiddle and pBuf fields; the pBitRev offset is defined but not referenced in this file.
124 .set ARMsFFTSpec_N, 0
125 .set ARMsFFTSpec_pBitRev, 4
126 .set ARMsFFTSpec_pTwiddle, 8
127 .set ARMsFFTSpec_pBuf, 12
128
129 .MACRO FFTSTAGE scaled, inverse, name
130 @// Body of the pre-twiddle stage. Only the name argument is referenced below (as a label suffix); scaled and inverse are not used in this body.
131 @// Read the size N from the FFTSpec structure (no log is taken in this stage)
132 LDR N, [pFFTSpec, #ARMsFFTSpec_N]
133
134 @// Read other structure parameters
135 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
136 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
137
138 VMOV half, 0.5
139 @// half is used as the by-scalar multiplier half[0] in the VMULs below
140
141 MOV size,N,ASR #1 @// size = N/2; N itself is left unchanged
142 MOV step,N,LSL #2 @// step = N/2 * 8 bytes
143
144
145 @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
146 @// Note: W^(k) is stored as a negated value, and the values from
147 @// the table also need to be conjugated
148
149 @// Z(0) : no need of twiddle multiply
150 @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
151
152 VLD1 dX0,[pSrc],step
153 ADD pOut1,pOut,step @// pOut1 = pOut + N/2*8 bytes; pOut1 shares r2 with pFFTSpec, which is not read after this point
154
155 VLD1 dX1,[pSrc]!
156 @// twStep = 3N/8 * 8 bytes pointing to W^1
157 SUB twStep,step,size,LSL #1
158
159 MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes
160 SUB step1,step1,#8 @// (N/4-1)*8 bytes
161
162 VADD dY0,dX0,dX1 @// [b+d | a+c]
163 VSUB dY1,dX0,dX1 @// [b-d | a-c]
164 VMUL dY0, dY0, half[0]
165 VMUL dY1, dY1, half[0]
166
167 @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
168 VZIP dY0,dY1
169
170 VSUB dX0,dY0,dY1
171 SUBS size,size,#2 @// sets the flags tested by BLT/BEQ below; nothing in between updates them
172 VADD dX1,dY0,dY1
173
174 SUB pSrc,pSrc,step
175
176 VST1 dX0[0],[pOut1]!
177 ADD pTwiddleTmp,pTwiddle,#8 @// W^2
178 VST1 dX1[1],[pOut1]!
179 ADD argTwiddle1,pTwiddle,twStep @// W^1
180
181
182 BLT decrementScale\name @// N/2 < 2: only Z(0) is produced
183 BEQ lastElement\name @// N/2 == 2: skip the loop, only the last element remains
184
185
186 @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
187 @// Note: W^k is stored as negative values in the table, and the
188 @// values from the table also need to be conjugated.
189 @//
190 @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
191 @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
192
193
194 SUB step,step,#24
195 evenOddButterflyLoop\name :
196 @// Each pass decrements size by 4 and step by 32; the loop repeats while size stays positive (BGT at the end).
197
198 VLD1 dW0r,[argTwiddle1],step1
199 VLD1 dW1r,[argTwiddle1]!
200
201 VLD2 {dX0r,dX0i},[pSrc],step
202 SUB argTwiddle1,argTwiddle1,step1
203 VLD2 {dX1r,dX1i},[pSrc]!
204
205 SUB step1,step1,#8 @// (N/4-2)*8 bytes
206 VLD1 dW0i,[pTwiddleTmp],step1
207 VLD1 dW1i,[pTwiddleTmp]!
208 SUB pSrc,pSrc,step
209
210 SUB pTwiddleTmp,pTwiddleTmp,step1
211 VREV64 dX1r,dX1r
212 VREV64 dX1i,dX1i
213 SUBS size,size,#4 @// sets the flags tested by the BGT at the end of the loop
214
215
216 VSUB dT2,dX0r,dX1r @// a-c
217 VADD dT3,dX0i,dX1i @// b+d
218 VADD dT0,dX0r,dX1r @// a+c
219 VSUB dT1,dX0i,dX1i @// b-d
220 SUB step1,step1,#8
221
222 VMUL dT2, dT2, half[0]
223 VMUL dT3, dT3, half[0]
224
225 VMUL dT0, dT0, half[0]
226 VMUL dT1, dT1, half[0]
227
228 VZIP dW1r,dW1i
229 VZIP dW0r,dW0i
230
231
232 VMUL dX1r,dW1r,dT2
233 VMUL dX1i,dW1r,dT3
234 VMUL dX0r,dW0r,dT2
235 VMUL dX0i,dW0r,dT3
236
237 VMLS dX1r,dW1i,dT3
238 VMLA dX1i,dW1i,dT2
239
240 VMLA dX0r,dW0i,dT3
241 VMLS dX0i,dW0i,dT2
242
243 @// dY0r/dY0i/dY1r/dY1i share D4-D7 with the twiddle aliases, which are not read again in this pass.
244 VADD dY1r,dT0,dX1i @// F(N/2 -1)
245 VSUB dY1i,dX1r,dT1
246
247 VREV64 dY1r,dY1r
248 VREV64 dY1i,dY1i
249
250
251 VADD dY0r,dT0,dX0i @// F(1)
252 VSUB dY0i,dT1,dX0r
253
254
255 VST2 {dY0r,dY0i},[pOut1],step
256 VST2 {dY1r,dY1i},[pOut1]!
257 SUB pOut1,pOut1,step
258 SUB step,step,#32 @// (N/2-4)*8 bytes
259
260
261 BGT evenOddButterflyLoop\name
262
263
264 @// set both the ptrs to the last element
265 SUB pSrc,pSrc,#8
266 SUB pOut1,pOut1,#8
267
268 @// Last element can be expanded as follows
269 @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
270 @// -ve)
271 @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
272 @// 1/2[2a+j0] - j (c-jd) [0+j2b]
273 @// (a+bc, -bd)
274 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
275
276 lastElement\name :
277 VLD1 dX0r,[pSrc]
278
279 VST1 dX0r[0],[pOut1]!
280 VNEG dX0r,dX0r
281 VST1 dX0r[1],[pOut1] @// store the negated second lane, giving (a,-b)
282
283
284
285 decrementScale\name :
286 @// No instruction follows this label inside the macro; execution continues after the expansion.
287 .endm
288
289 M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4
290 @// Expands FFTSTAGE with label suffix Inv. NOTE(review): M_START/M_END are defined outside this file (presumably armCOMM_s.h); confirm what the r4 argument saves.
291 FFTSTAGE "FALSE","TRUE",Inv
292 M_END
293
294 .end
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698