OLD | NEW |
(Empty) | |
| 1 @// |
| 2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
| 3 @// |
| 4 @// Use of this source code is governed by a BSD-style license |
| 5 @// that can be found in the LICENSE file in the root of the source |
| 6 @// tree. An additional intellectual property rights grant can be found |
| 7 @// in the file PATENTS. All contributing project authors may |
| 8 @// be found in the AUTHORS file in the root of the source tree. |
| 9 @// |
| 10 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s |
| 11 @// to support float instead of SC32. |
| 12 @// |
| 13 |
| 14 @// |
| 15 @// Description: |
| 16 @// Compute a Radix 4 FFT stage for a N point complex signal |
| 17 @// |
| 18 @// |
| 19 |
| 20 |
| 21 @// Include standard headers |
| 22 |
| 23 #include "dl/api/armCOMM_s.h" |
| 24 #include "dl/api/omxtypes_s.h" |
| 25 |
| 26 @// Import symbols required from other files |
| 27 @// (For example tables) |
| 28 |
| 29 |
| 30 |
| 31 |
| 32 @// Set debugging level |
| 33 @//DEBUG_ON SETL {TRUE} |
| 34 |
| 35 |
| 36 @// Guarding implementation by the processor name |
| 37 |
| 38 |
| 39 @// Import symbols required from other files |
| 40 @// (For example tables) |
| 41 @//IMPORT armAAC_constTable |
| 42 |
| 43 @//Input Registers |
| 44 |
@// Core-register aliases for the values FFTSTAGE expects on entry.
@//   pSrc       - read with post-incrementing VLD4 loads
@//   pDst       - written with post-incrementing VST2 stores
@//   pTwiddle   - read with VLD1/VLD2 twiddle loads
@//   subFFTNum  - set to 1 by FFTSTAGE
@//   subFFTSize - read on entry, then replaced by 4 * its entry value
| 45 #define pSrc r0 |
| 46 #define pDst r2 |
| 47 #define pTwiddle r1 |
| 48 #define subFFTNum r6 |
| 49 #define subFFTSize r7 |
| 50 |
| 51 |
| 52 |
| 53 @//Output Registers |
| 54 |
| 55 |
| 56 @//Local Scratch Registers |
| 57 |
@// Core-register aliases that FFTSTAGE uses as temporaries.
@// grpCount and pTmp both name r4: grpCount is the loop counter of
@// radix4lsGrpLoop, pTmp is only written after that loop has finished.
@// step16 and step24 are loaded with the constants 16 and 24 and used as
@// post-increment amounts for the twiddle loads.
| 58 #define outPointStep r3 |
| 59 #define grpCount r4 |
| 60 #define dstStep r5 |
| 61 #define grpTwStep r8 |
| 62 #define stepTwiddle r9 |
| 63 #define twStep r10 |
| 64 #define pTmp r4 |
| 65 #define step16 r11 |
| 66 #define step24 r12 |
| 67 |
| 68 |
| 69 @// Neon Registers |
| 70 |
@// NEON register aliases, all typed as F32.
@// dButterfly* and dX* are two sets of names for the same registers D0-D7:
@// the dButterfly* names are used for the VLD4 loads and the VUZP shuffles,
@// the dX* names for the twiddle multiplies that follow.
| 71 #define dButterfly1Real02 D0.F32 |
| 72 #define dButterfly1Imag02 D1.F32 |
| 73 #define dButterfly1Real13 D2.F32 |
| 74 #define dButterfly1Imag13 D3.F32 |
| 75 #define dButterfly2Real02 D4.F32 |
| 76 #define dButterfly2Imag02 D5.F32 |
| 77 #define dButterfly2Real13 D6.F32 |
| 78 #define dButterfly2Imag13 D7.F32 |
| 79 #define dXr0 D0.F32 |
| 80 #define dXi0 D1.F32 |
| 81 #define dXr1 D2.F32 |
| 82 #define dXi1 D3.F32 |
| 83 #define dXr2 D4.F32 |
| 84 #define dXi2 D5.F32 |
| 85 #define dXr3 D6.F32 |
| 86 #define dXi3 D7.F32 |
| 87 |
@// dYr0..dYi3 (D16-D23) are the D-register halves of qY0..qY3 (Q8-Q11)
@// defined further down.
| 88 #define dYr0 D16.F32 |
| 89 #define dYi0 D17.F32 |
| 90 #define dYr1 D18.F32 |
| 91 #define dYi1 D19.F32 |
| 92 #define dYr2 D20.F32 |
| 93 #define dYi2 D21.F32 |
| 94 #define dYr3 D22.F32 |
| 95 #define dYi3 D23.F32 |
| 96 |
@// Twiddle factors W1, W2, W3 as real/imaginary pairs in D8-D13.
@// qT0..qT5 are not referenced by the FFTSTAGE macro in this file; despite
@// the q prefix they name D registers, and qT0..qT4 share registers with
@// dZr0 and dYr0..dYr3.
| 97 #define dW1r D8.F32 |
| 98 #define dW1i D9.F32 |
| 99 #define dW2r D10.F32 |
| 100 #define dW2i D11.F32 |
| 101 #define dW3r D12.F32 |
| 102 #define dW3i D13.F32 |
| 103 #define qT0 d14.f32 |
| 104 #define qT1 d16.F32 |
| 105 #define qT2 d18.F32 |
| 106 #define qT3 d20.f32 |
| 107 #define qT4 d22.f32 |
| 108 #define qT5 d24.f32 |
| 109 |
@// dZr0/dZi0 (D14/D15) are the halves of qZ0 (Q7); dZr1..dZi3 (D26-D31)
@// are the halves of qZ1..qZ3 (Q13-Q15).
| 110 #define dZr0 D14.F32 |
| 111 #define dZi0 D15.F32 |
| 112 #define dZr1 D26.F32 |
| 113 #define dZi1 D27.F32 |
| 114 #define dZr2 D28.F32 |
| 115 #define dZi2 D29.F32 |
| 116 #define dZr3 D30.F32 |
| 117 #define dZi3 D31.F32 |
| 118 |
@// Quad-register views: qX0 (Q0) covers dXr0/dXi0, qY0..qY3 cover the dY*
@// pairs and qZ0..qZ3 cover the dZ* pairs, so one Q-register instruction
@// handles the real and imaginary halves together.
| 119 #define qX0 Q0.F32 |
| 120 #define qY0 Q8.F32 |
| 121 #define qY1 Q9.F32 |
| 122 #define qY2 Q10.F32 |
| 123 #define qY3 Q11.F32 |
| 124 #define qZ0 Q7.F32 |
| 125 #define qZ1 Q13.F32 |
| 126 #define qZ2 Q14.F32 |
| 127 #define qZ3 Q15.F32 |
| 128 |
| 129 |
| 130 |
@// FFTSTAGE scaled, inverse, name
@//   Emits the body of a radix-4 FFT stage that processes two groups per
@//   pass of radix4lsGrpLoop, reading from pSrc and writing to pDst.
@//   scaled  - not referenced anywhere in this macro body.
@//   inverse - "TRUE" selects the conjugate twiddle multiply and the
@//             Z0,Z3,Z2,Z1 store order; any other value selects the plain
@//             multiply and the Z0,Z1,Z2,Z3 store order.
@//   name    - suffix pasted onto the local labels so that each expansion
@//             of the macro gets its own label names.
@//   Writes r0-r12 and NEON registers D0-D23 and D26-D31.
| 131 .MACRO FFTSTAGE scaled, inverse , name |
| 132 |
| 133 @// Define stack arguments (none are defined in this macro) |
| 134 |
| 135 |
| 136 @// pOut0+1 increments pOut0 by 8 bytes |
| 137 @// pOut0+outPointStep == increment of 8*outPointStep bytes |
| 138 MOV outPointStep,subFFTSize,LSL #3 |
| 139 |
| 140 @// Update grpCount and grpSize right away; the first twiddle loads are interleaved with this setup |
| 141 |
| 142 VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr] |
| 143 MOV step16,#16 |
| 144 LSL grpCount,subFFTSize,#2 |
| 145 |
| 146 VLD1 dW2r,[pTwiddle :64] @// [wi|wr] |
| 147 MOV subFFTNum,#1 @//after the last stage |
| 148 |
| 149 VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr] |
| 150 MOV stepTwiddle,#0 |
| 151 |
| 152 VLD1 dW2i,[pTwiddle :64]! @// [wi|wr] |
| 153 SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to st
art with |
| 154 |
| 155 @// update subFFTSize for the next stage (grpCount holds 4 * the entry value here) |
| 156 MOV subFFTSize,grpCount |
| 157 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr] |
| 158 MOV dstStep,outPointStep,LSL #1 |
| 159 |
| 160 @// AC.r AC.i BD.r BD.i (first of the two groups; pSrc advances by writeback) |
| 161 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterf
ly1Imag13},[pSrc :256]! |
| 162 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPoint
Step |
| 163 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPoi
ntStep+16 |
| 164 MOV step24,#24 |
| 165 |
| 166 @// AC.r AC.i BD.r BD.i (second of the two groups) |
| 167 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterf
ly2Imag13},[pSrc :256]! |
| 168 |
| 169 |
| 170 @// Process two groups at a time |
| 171 |
| 172 radix4lsGrpLoop\name : |
| 173 |
@// The VZIP/VUZP shuffles below are interleaved with the integer updates of
@// the twiddle step registers. After the four VUZPs, dXr0..dXi3 hold the
@// A, B, C and D points of both groups, as the per-line comments indicate.
| 174 VZIP dW2r,dW2i |
| 175 ADD stepTwiddle,stepTwiddle,#16 |
| 176 VZIP dW3r,dW3i |
| 177 ADD grpTwStep,stepTwiddle,#4 |
| 178 VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r |
| 179 SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle |
| 180 VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i |
| 181 MOV grpTwStep,grpTwStep,LSL #1 |
| 182 VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r |
| 183 RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle |
| 184 |
| 185 |
| 186 VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i |
| 187 |
| 188 |
| 189 @// grpCount is multiplied by 4 |
@// The flags set by this SUBS are consumed by the addeq/beq pair further
@// down and by the BGT that closes the loop; no instruction in between
@// updates the condition flags.
| 190 SUBS grpCount,grpCount,#8 |
| 191 |
@// Z1 = X1 * W1 (inverse: X1 * conj(W1))
| 192 .ifeqs "\inverse", "TRUE" |
| 193 VMUL dZr1,dW1r,dXr1 |
| 194 VMLA dZr1,dW1i,dXi1 @// real part |
| 195 VMUL dZi1,dW1r,dXi1 |
| 196 VMLS dZi1,dW1i,dXr1 @// imag part |
| 197 |
| 198 .else |
| 199 |
| 200 VMUL dZr1,dW1r,dXr1 |
| 201 VMLS dZr1,dW1i,dXi1 @// real part |
| 202 VMUL dZi1,dW1r,dXi1 |
| 203 VMLA dZi1,dW1i,dXr1 @// imag part |
| 204 |
| 205 .endif |
| 206 |
@// W1 is no longer needed for this pass: reload it for the next one.
| 207 VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr] |
| 208 |
@// Z2 = X2 * W2 (inverse: X2 * conj(W2)); the reload of dW2r is slotted in
@// once its last use in the multiply has been issued.
| 209 .ifeqs "\inverse", "TRUE" |
| 210 VMUL dZr2,dW2r,dXr2 |
| 211 VMLA dZr2,dW2i,dXi2 @// real part |
| 212 VMUL dZi2,dW2r,dXi2 |
| 213 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr] |
| 214 VMLS dZi2,dW2i,dXr2 @// imag part |
| 215 |
| 216 .else |
| 217 |
| 218 VMUL dZr2,dW2r,dXr2 |
| 219 VMLS dZr2,dW2i,dXi2 @// real part |
| 220 VMUL dZi2,dW2r,dXi2 |
| 221 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr] |
| 222 VMLA dZi2,dW2i,dXr2 @// imag part |
| 223 |
| 224 .endif |
| 225 |
| 226 |
| 227 VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr] |
| 228 |
| 229 @// move qX0 so as to load for the next iteration (X0 gets no twiddle multiply) |
| 230 VMOV qZ0,qX0 |
| 231 |
@// Z3 = X3 * W3 (inverse: X3 * conj(W3)), with the dW3r reload slotted in
@// the same way as for W2.
| 232 .ifeqs "\inverse", "TRUE" |
| 233 VMUL dZr3,dW3r,dXr3 |
| 234 VMLA dZr3,dW3i,dXi3 @// real part |
| 235 VMUL dZi3,dW3r,dXi3 |
| 236 VLD1 dW3r,[pTwiddle :64],step24 |
| 237 VMLS dZi3,dW3i,dXr3 @// imag part |
| 238 |
| 239 .else |
| 240 |
| 241 VMUL dZr3,dW3r,dXr3 |
| 242 VMLS dZr3,dW3i,dXi3 @// real part |
| 243 VMUL dZi3,dW3r,dXi3 |
| 244 VLD1 dW3r,[pTwiddle :64],step24 |
| 245 VMLA dZi3,dW3i,dXr3 @// imag part |
| 246 |
| 247 .endif |
| 248 |
| 249 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|wr] |
| 250 |
| 251 @// Don't do the load on the last iteration so we don't read past the en
d |
| 252 @// of pSrc. |
@// On that last pass (Z set by the SUBS above) pSrc is still advanced by
@// 64 bytes, the amount the two skipped VLD4 writebacks would have added,
@// so the fix-up after the loop is the same in both cases.
| 253 addeq pSrc, pSrc, #64 |
| 254 beq radix4lsSkipRead\name |
| 255 @// AC.r AC.i BD.r BD.i |
| 256 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterf
ly1Imag13},[pSrc :256]! |
| 257 |
| 258 @// AC.r AC.i BD.r BD.i |
| 259 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterf
ly2Imag13},[pSrc :256]! |
| 260 radix4lsSkipRead\name: |
| 261 |
| 262 @// finish first stage of 4 point FFT: Y0 = Z0+Z2, Y2 = Z0-Z2, Y1 = Z1+Z3, Y3 = Z1-Z3 |
| 263 |
| 264 VADD qY0,qZ0,qZ2 |
| 265 VSUB qY2,qZ0,qZ2 |
| 266 VADD qY1,qZ1,qZ3 |
| 267 VSUB qY3,qZ1,qZ3 |
| 268 |
| 269 |
| 270 @// finish second stage of 4 point FFT |
@// Both branches compute the same four values and differ only in the
@// order in which they are stored:
@//   Z0 = Y2 - Y1                     Z2 = Y2 + Y1
@//   Z1 = (Yr0 - Yi3, Yi0 + Yr3)      Z3 = (Yr0 + Yi3, Yi0 - Yr3)
@// NOTE(review): a textbook radix-4 butterfly combines Y0 with Y1 and Y2
@// with j*Y3; confirm the Y0/Y2 pairing used here against a reference FFT.
| 271 |
| 272 .ifeqs "\inverse", "TRUE" |
| 273 |
@// Inverse: store order is Z0, Z3, Z2, Z1.
| 274 VSUB qZ0,qY2,qY1 |
| 275 |
| 276 VADD dZr3,dYr0,dYi3 |
| 277 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 278 VSUB dZi3,dYi0,dYr3 |
| 279 |
| 280 VADD qZ2,qY2,qY1 |
| 281 VST2 {dZr3,dZi3},[pDst :128],outPointStep |
| 282 |
| 283 VSUB dZr1,dYr0,dYi3 |
| 284 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 285 VADD dZi1,dYi0,dYr3 |
| 286 |
| 287 @// dstStep = -3*outPointStep + 16: undo the three outPointStep advances above and move on by 16 bytes |
| 288 VST2 {dZr1,dZi1},[pDst :128],dstStep |
| 289 |
| 290 |
| 291 .else |
| 292 |
@// Forward: store order is Z0, Z1, Z2, Z3.
| 293 VSUB qZ0,qY2,qY1 |
| 294 |
| 295 VSUB dZr1,dYr0,dYi3 |
| 296 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 297 VADD dZi1,dYi0,dYr3 |
| 298 |
| 299 VADD qZ2,qY2,qY1 |
| 300 VST2 {dZr1,dZi1},[pDst :128],outPointStep |
| 301 |
| 302 VADD dZr3,dYr0,dYi3 |
| 303 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 304 VSUB dZi3,dYi0,dYr3 |
| 305 |
| 306 @// dstStep = -3*outPointStep + 16: undo the three outPointStep advances above and move on by 16 bytes |
| 307 VST2 {dZr3,dZi3},[pDst :128],dstStep |
| 308 |
| 309 |
| 310 .endif |
| 311 |
@// Loop while grpCount, as left by the SUBS near the top of the loop, is
@// still positive.
| 312 BGT radix4lsGrpLoop\name |
| 313 |
| 314 |
| 315 @// Reset and Swap pSrc and pDst for the next stage |
| 316 MOV pTmp,pDst |
| 317 @// Extra increment done in final iteration of the loop |
| 318 SUB pSrc,pSrc,#64 |
| 319 @// pDst -= 4*size; pSrc -= 8*size bytes |
| 320 SUB pDst,pSrc,outPointStep,LSL #2 |
| 321 SUB pSrc,pTmp,outPointStep |
| 322 SUB pTwiddle,pTwiddle,subFFTSize,LSL #1 |
| 323 @// Extra increment done in final iteration of the loop |
| 324 SUB pTwiddle,pTwiddle,#16 |
| 325 |
| 326 .endm |
| 327 |
| 328 |
@// Forward entry point: expands FFTSTAGE with scaled = "FALSE",
@// inverse = "FALSE" and the label suffix "fwd".
@// M_START/M_END are not defined in this file (presumably they come from
@// armCOMM_s.h); the meaning of the r4 argument should be checked there.
| 329 M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4 |
| 330 FFTSTAGE "FALSE","FALSE",fwd |
| 331 M_END |
| 332 |
| 333 |
@// Inverse entry point: expands FFTSTAGE with scaled = "FALSE",
@// inverse = "TRUE" and the label suffix "inv".
@// M_START/M_END are not defined in this file (presumably they come from
@// armCOMM_s.h); the meaning of the r4 argument should be checked there.
| 334 M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4 |
| 335 FFTSTAGE "FALSE","TRUE",inv |
| 336 M_END |
| 337 |
| 338 |
| 339 .end |
OLD | NEW |