OLD | NEW |
(Empty) | |
| 1 @// |
| 2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. |
| 3 @// |
| 4 @// Use of this source code is governed by a BSD-style license |
| 5 @// that can be found in the LICENSE file in the root of the source |
| 6 @// tree. An additional intellectual property rights grant can be found |
| 7 @// in the file PATENTS. All contributing project authors may |
| 8 @// be found in the AUTHORS file in the root of the source tree. |
| 9 @// |
| 10 @// This file was originally licensed as follows. It has been |
| 11 @// relicensed with permission from the copyright holders. |
| 12 @// |
| 13 |
| 14 @// |
| 15 @// File Name: armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s |
| 16 @// OpenMAX DL: v1.0.2 |
| 17 @// Last Modified Revision: 7767 |
| 18 @// Last Modified Date: Thu, 27 Sep 2007 |
| 19 @// |
| 20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. |
| 21 @// |
| 22 @// |
| 23 @// |
| 24 @// Description: |
| 25 @// Compute a Radix 4 FFT stage for a N point complex signal |
| 26 @// |
| 27 |
| 28 |
| 29 @// Include standard headers |
| 30 |
| 31 #include "dl/api/armCOMM_s.h" |
| 32 #include "dl/api/omxtypes_s.h" |
| 33 |
| 34 @// Import symbols required from other files |
| 35 @// (For example tables) |
| 36 |
| 37 |
| 38 |
| 39 |
| 40 @// Set debugging level |
| 41 @//DEBUG_ON SETL {TRUE} |
| 42 |
| 43 |
| 44 @// Guarding implementation by the processor name |
| 45 |
| 46 |
| 47 @// Import symbols required from other files |
| 48 @// (For example tables) |
| 49 @//IMPORT armAAC_constTable |
| 50 |
| 51 @//Input Registers |
| 52 |
@// Core-register aliases that FFTSTAGE reads on entry.
@//   pSrc       - read pointer for the VLD4 loads (post-incremented)
@//   pDst       - write pointer for the VST2 stores (post-incremented)
@//   pTwiddle   - read pointer for the twiddle loads
@//   subFFTNum  - not read by FFTSTAGE; overwritten with 1
@//   subFFTSize - read on entry, then overwritten with 4 * its entry value
| 53 #define pSrc r0 |
| 54 #define pDst r2 |
| 55 #define pTwiddle r1 |
| 56 #define subFFTNum r6 |
| 57 #define subFFTSize r7 |
| 58 |
| 59 |
| 60 |
| 61 @//Output Registers |
| 62 |
| 63 |
| 64 @//Local Scratch Registers |
| 65 |
@// Core-register aliases used as temporaries inside FFTSTAGE.
@// grpCount and pTmp both name r4: grpCount is the loop counter and is
@// last used by the SUBS/BGT pair, pTmp is only used after the loop has
@// finished, so the two lifetimes do not overlap.
| 66 #define outPointStep r3 |
| 67 #define grpCount r4 |
| 68 #define dstStep r5 |
| 69 #define grpTwStep r8 |
| 70 #define stepTwiddle r9 |
| 71 #define twStep r10 |
| 72 #define pTmp r4 |
| 73 #define step16 r11 |
| 74 #define step24 r12 |
| 75 |
| 76 |
| 77 @// Neon Registers |
| 78 |
@// NEON register aliases. Several names deliberately refer to the same
@// physical registers (Qn overlays D(2n) and D(2n+1)):
@//   dButterfly1*/dButterfly2* and dXr0..dXi3 all name D0-D7
@//   qX0 = Q0  = {dXr0,dXi0}
@//   qT0 = Q7  = {dZr0,dZi0} = qZ0
@//   qT1..qT4 = Q8..Q11 = qY0..qY3 = {dYr0,dYi0}..{dYr3,dYi3}
@//   qZ1..qZ3 = Q13..Q15 = {dZr1,dZi1}..{dZr3,dZi3}
@// NOTE(review): D8-D15 (dW1r..dW3i, dZr0, dZi0) are callee-saved under
@// the AAPCS; nothing in this file saves them -- confirm that the caller
@// or M_START takes care of it.
| 79 #define dButterfly1Real02 D0.S32 |
| 80 #define dButterfly1Imag02 D1.S32 |
| 81 #define dButterfly1Real13 D2.S32 |
| 82 #define dButterfly1Imag13 D3.S32 |
| 83 #define dButterfly2Real02 D4.S32 |
| 84 #define dButterfly2Imag02 D5.S32 |
| 85 #define dButterfly2Real13 D6.S32 |
| 86 #define dButterfly2Imag13 D7.S32 |
| 87 #define dXr0 D0.S32 |
| 88 #define dXi0 D1.S32 |
| 89 #define dXr1 D2.S32 |
| 90 #define dXi1 D3.S32 |
| 91 #define dXr2 D4.S32 |
| 92 #define dXi2 D5.S32 |
| 93 #define dXr3 D6.S32 |
| 94 #define dXi3 D7.S32 |
| 95 |
| 96 #define dYr0 D16.S32 |
| 97 #define dYi0 D17.S32 |
| 98 #define dYr1 D18.S32 |
| 99 #define dYi1 D19.S32 |
| 100 #define dYr2 D20.S32 |
| 101 #define dYi2 D21.S32 |
| 102 #define dYr3 D22.S32 |
| 103 #define dYi3 D23.S32 |
| 104 |
| 105 #define dW1r D8.S32 |
| 106 #define dW1i D9.S32 |
| 107 #define dW2r D10.S32 |
| 108 #define dW2i D11.S32 |
| 109 #define dW3r D12.S32 |
| 110 #define dW3i D13.S32 |
@// 64-bit accumulators for the widening twiddle multiplies (VMULL/VMLAL/VMLSL).
| 111 #define qT0 Q7.S64 |
| 112 #define qT1 Q8.S64 |
| 113 #define qT2 Q9.S64 |
| 114 #define qT3 Q10.S64 |
| 115 #define qT4 Q11.S64 |
| 116 #define qT5 Q12.S64 |
| 117 |
| 118 #define dZr0 D14.S32 |
| 119 #define dZi0 D15.S32 |
| 120 #define dZr1 D26.S32 |
| 121 #define dZi1 D27.S32 |
| 122 #define dZr2 D28.S32 |
| 123 #define dZi2 D29.S32 |
| 124 #define dZr3 D30.S32 |
| 125 #define dZi3 D31.S32 |
| 126 |
| 127 #define qX0 Q0.S32 |
| 128 #define qY0 Q8.S32 |
| 129 #define qY1 Q9.S32 |
| 130 #define qY2 Q10.S32 |
| 131 #define qY3 Q11.S32 |
| 132 #define qZ0 Q7.S32 |
| 133 #define qZ1 Q13.S32 |
| 134 #define qZ2 Q14.S32 |
| 135 #define qZ3 Q15.S32 |
| 136 |
| 137 |
| 138 |
@// FFTSTAGE scaled, inverse, name
@//   Expands to the body of one radix-4 stage (the file name marks it as
@//   the "ls" variant).
@//
@//   scaled  - "TRUE" builds the 4-point butterfly from halving
@//             VHADD/VHSUB; any other value uses plain VADD/VSUB.
@//   inverse - "TRUE" multiplies by the conjugated twiddle
@//             (re = wr*xr + wi*xi, im = wr*xi - wi*xr) and uses the
@//             inverse output ordering; any other value multiplies by
@//             the twiddle itself (re = wr*xr - wi*xi, im = wr*xi + wi*xr).
@//   name    - suffix pasted onto the grpLoop label so that every
@//             expansion in this file gets a distinct label.
@//
@//   Register effects visible in the macro body:
@//     subFFTNum  := 1
@//     subFFTSize := 4 * (entry value)
@//     pSrc/pDst  := rewound and exchanged after the loop
@//     pTwiddle   := rewound after the loop
@//     r3-r5, r8-r12 and the NEON registers named above are overwritten.
| 139 .MACRO FFTSTAGE scaled, inverse , name |
| 140 |
| 141 @// Define stack arguments |
| 142 |
| 143 |
| 144 @// pOut0+1 increments pOut0 by 8 bytes |
| 145 @// pOut0+outPointStep == increment of 8*outPointStep bytes |
| 146 MOV outPointStep,subFFTSize,LSL #3 |
| 147 |
| 148 @// Update grpCount and grpSize rightaway |
| 149 |
@// The first three twiddle loads below all read from the initial pTwiddle
@// address: the first two have no writeback and the third post-increments
@// by step16 only after its load.
| 150 VLD2 {dW1r,dW1i},[pTwiddle :128] @// [wi|wr] |
| 151 MOV step16,#16 |
@// grpCount = 4 * subFFTSize; the loop subtracts 8 per iteration.
| 152 LSL grpCount,subFFTSize,#2 |
| 153 |
| 154 VLD1 dW2r,[pTwiddle :64] @// [wi|wr] |
| 155 MOV subFFTNum,#1 @//after the last stage |
| 156 |
| 157 VLD1 dW3r,[pTwiddle :64],step16 @// [wi|wr] |
| 158 MOV stepTwiddle,#0 |
| 159 |
| 160 VLD1 dW2i,[pTwiddle :64]! @// [wi|wr] |
| 161 SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 t
o start with |
| 162 |
| 163 @// update subFFTSize for the next stage |
| 164 MOV subFFTSize,grpCount |
| 165 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|
wr] |
| 166 MOV dstStep,outPointStep,LSL #1 |
| 167 |
@// Pre-load the first 64 bytes of input (two VLD4s of 32 bytes each); inside
@// the loop the loads always fetch the data for the following iteration.
| 168 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterf
ly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i |
| 169 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outP
ointStep |
| 170 RSB dstStep,dstStep,#16 @// dstStep = - 3*ou
tPointStep+16 |
| 171 MOV step24,#24 |
| 172 |
| 173 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterf
ly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i |
| 174 |
| 175 |
| 176 @// Process two groups at a time |
| 177 |
| 178 grpLoop\name : |
| 179 |
@// ARM address/step arithmetic is interleaved with the NEON shuffles below.
@// After these instructions:
@//   stepTwiddle += 16
@//   twStep       = stepTwiddle - 16
@//   grpTwStep    = -(2*stepTwiddle + 8)
| 180 VZIP dW2r,dW2i |
| 181 ADD stepTwiddle,stepTwiddle,#16 @// increment for th
e next iteration |
| 182 VZIP dW3r,dW3i |
| 183 ADD grpTwStep,stepTwiddle,#4 |
| 184 VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r |
| 185 SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle |
| 186 VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i |
| 187 MOV grpTwStep,grpTwStep,LSL #1 |
| 188 VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r |
| 189 RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle |
| 190 |
| 191 |
| 192 VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i |
| 193 |
| 194 |
@// The flags set here are consumed by the BGT at the bottom of the loop;
@// no instruction in between writes the condition flags.
| 195 SUBS grpCount,grpCount,#8 @// grpCount is multipli
ed by 4 |
| 196 |
@// qT0/qT1 = 64-bit real/imag products of twiddle 1 and input 1.
| 197 .ifeqs "\inverse", "TRUE" |
| 198 VMULL qT0,dW1r,dXr1 |
| 199 VMLAL qT0,dW1i,dXi1 @// real part |
| 200 VMULL qT1,dW1r,dXi1 |
| 201 VMLSL qT1,dW1i,dXr1 @// imag part |
| 202 |
| 203 .else |
| 204 |
| 205 VMULL qT0,dW1r,dXr1 |
| 206 VMLSL qT0,dW1i,dXi1 @// real part |
| 207 VMULL qT1,dW1r,dXi1 |
| 208 VMLAL qT1,dW1i,dXr1 @// imag part |
| 209 |
| 210 .endif |
| 211 |
@// dW1r/dW1i have been consumed, so reload them for the next iteration.
| 212 VLD2 {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr] |
| 213 |
@// qT2/qT3 = 64-bit real/imag products of twiddle 2 and input 2; the reload
@// of dW2r is slotted in once its last use has been issued.
| 214 .ifeqs "\inverse", "TRUE" |
| 215 VMULL qT2,dW2r,dXr2 |
| 216 VMLAL qT2,dW2i,dXi2 @// real part |
| 217 VMULL qT3,dW2r,dXi2 |
| 218 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr] |
| 219 VMLSL qT3,dW2i,dXr2 @// imag part |
| 220 |
| 221 .else |
| 222 |
| 223 VMULL qT2,dW2r,dXr2 |
| 224 VMLSL qT2,dW2i,dXi2 @// real part |
| 225 VMULL qT3,dW2r,dXi2 |
| 226 VLD1 dW2r,[pTwiddle :64],step16 @// [wi|wr] |
| 227 VMLAL qT3,dW2i,dXr2 @// imag part |
| 228 |
| 229 .endif |
| 230 |
| 231 |
@// Round and narrow the 64-bit products back to 32 bits with a right shift
@// of 31 (consistent with Q31 twiddles -- TODO confirm against the table
@// generator).
| 232 VRSHRN dZr1,qT0,#31 |
| 233 VLD1 dW2i,[pTwiddle :64],twStep @// [wi|wr] |
| 234 VRSHRN dZi1,qT1,#31 |
| 235 |
@// qT0 and qZ0 are both Q7: the products in qT0 have been narrowed into
@// dZr1 above, so Q7 can now take the copy of input 0.
| 236 VMOV qZ0,qX0 @// move qX0 so as to lo
ad for the next iteration |
| 237 VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterf
ly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i |
| 238 |
| 239 |
@// qT4/qT5 = 64-bit real/imag products of twiddle 3 and input 3. The VLD4
@// above only overwrote D0-D3, so dXr3/dXi3 (D6/D7) are still valid here.
| 240 .ifeqs "\inverse", "TRUE" |
| 241 VMULL qT4,dW3r,dXr3 |
| 242 VMLAL qT4,dW3i,dXi3 @// real part |
| 243 VMULL qT5,dW3r,dXi3 |
| 244 VLD1 dW3r,[pTwiddle :64],step24 |
| 245 VMLSL qT5,dW3i,dXr3 @// imag part |
| 246 |
| 247 .else |
| 248 |
| 249 VMULL qT4,dW3r,dXr3 |
| 250 VMLSL qT4,dW3i,dXi3 @// real part |
| 251 VMULL qT5,dW3r,dXi3 |
| 252 VLD1 dW3r,[pTwiddle :64],step24 |
| 253 VMLAL qT5,dW3i,dXr3 @// imag part |
| 254 |
| 255 .endif |
| 256 |
| 257 VRSHRN dZr2,qT2,#31 |
| 258 VLD1 dW3i,[pTwiddle :64],grpTwStep @// [wi|
wr] |
| 259 VRSHRN dZi2,qT3,#31 |
| 260 |
| 261 VRSHRN dZr3,qT4,#31 |
| 262 VRSHRN dZi3,qT5,#31 |
| 263 VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterf
ly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i |
| 264 |
| 265 |
@// 4-point butterfly and stores. Each of the four variants below issues
@// four VST2s: the first three post-increment pDst by outPointStep and the
@// fourth by dstStep (16 - 3*outPointStep), i.e. a net +16 bytes per
@// iteration.
| 266 .ifeqs "\scaled", "TRUE" |
| 267 |
| 268 @// finish first stage of 4 point FFT |
| 269 |
| 270 VHADD qY0,qZ0,qZ2 |
| 271 VHSUB qY2,qZ0,qZ2 |
| 272 VHADD qY1,qZ1,qZ3 |
| 273 VHSUB qY3,qZ1,qZ3 |
| 274 |
| 275 |
| 276 @// finish second stage of 4 point FFT |
| 277 |
| 278 .ifeqs "\inverse", "TRUE" |
| 279 |
| 280 VHSUB qZ0,qY2,qY1 |
| 281 |
| 282 VHADD dZr3,dYr0,dYi3 |
| 283 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 284 VHSUB dZi3,dYi0,dYr3 |
| 285 |
| 286 VHADD qZ2,qY2,qY1 |
| 287 VST2 {dZr3,dZi3},[pDst :128],outPointStep |
| 288 |
| 289 VHSUB dZr1,dYr0,dYi3 |
| 290 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 291 VHADD dZi1,dYi0,dYr3 |
| 292 |
@// NOTE: dstStep holds 16 - 3*outPointStep (see the RSB before the loop);
@// the "-outPointStep + 16" in the comment on the next store is inaccurate.
| 293 VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep
= -outPointStep + 16 |
| 294 |
| 295 |
| 296 .else |
| 297 |
| 298 VHSUB qZ0,qY2,qY1 |
| 299 |
| 300 VHSUB dZr1,dYr0,dYi3 |
| 301 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 302 VHADD dZi1,dYi0,dYr3 |
| 303 |
| 304 VHADD qZ2,qY2,qY1 |
| 305 VST2 {dZr1,dZi1},[pDst :128],outPointStep |
| 306 |
| 307 VHADD dZr3,dYr0,dYi3 |
| 308 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 309 VHSUB dZi3,dYi0,dYr3 |
| 310 |
@// NOTE: dstStep holds 16 - 3*outPointStep (see the RSB before the loop);
@// the "-outPointStep + 16" in the comment on the next store is inaccurate.
| 311 VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep
= -outPointStep + 16 |
| 312 |
| 313 |
| 314 .endif |
| 315 |
| 316 |
| 317 |
| 318 .else |
| 319 |
| 320 @// finish first stage of 4 point FFT |
| 321 |
| 322 VADD qY0,qZ0,qZ2 |
| 323 VSUB qY2,qZ0,qZ2 |
| 324 VADD qY1,qZ1,qZ3 |
| 325 VSUB qY3,qZ1,qZ3 |
| 326 |
| 327 |
| 328 @// finish second stage of 4 point FFT |
| 329 |
| 330 .ifeqs "\inverse", "TRUE" |
| 331 |
| 332 VSUB qZ0,qY2,qY1 |
| 333 |
| 334 VADD dZr3,dYr0,dYi3 |
| 335 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 336 VSUB dZi3,dYi0,dYr3 |
| 337 |
| 338 VADD qZ2,qY2,qY1 |
| 339 VST2 {dZr3,dZi3},[pDst :128],outPointStep |
| 340 |
| 341 VSUB dZr1,dYr0,dYi3 |
| 342 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 343 VADD dZi1,dYi0,dYr3 |
| 344 |
@// NOTE: dstStep holds 16 - 3*outPointStep (see the RSB before the loop);
@// the "-outPointStep + 16" in the comment on the next store is inaccurate.
| 345 VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep
= -outPointStep + 16 |
| 346 |
| 347 |
| 348 .else |
| 349 |
| 350 VSUB qZ0,qY2,qY1 |
| 351 |
| 352 VSUB dZr1,dYr0,dYi3 |
| 353 VST2 {dZr0,dZi0},[pDst :128],outPointStep |
| 354 VADD dZi1,dYi0,dYr3 |
| 355 |
| 356 VADD qZ2,qY2,qY1 |
| 357 VST2 {dZr1,dZi1},[pDst :128],outPointStep |
| 358 |
| 359 VADD dZr3,dYr0,dYi3 |
| 360 VST2 {dZr2,dZi2},[pDst :128],outPointStep |
| 361 VSUB dZi3,dYi0,dYr3 |
| 362 |
@// NOTE: dstStep holds 16 - 3*outPointStep (see the RSB before the loop);
@// the "-outPointStep + 16" in the comment on the next store is inaccurate.
| 363 VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep
= -outPointStep + 16 |
| 364 |
| 365 |
| 366 .endif |
| 367 |
| 368 .endif |
| 369 |
@// Loop while grpCount > 0, using the flags from the SUBS near the loop top.
| 370 BGT grpLoop\name |
| 371 |
| 372 |
| 373 @// Reset and Swap pSrc and pDst for the next stage |
@// pTmp shares r4 with grpCount, which is no longer needed here.
@// The 64 bytes undone on pSrc are the two VLD4 pre-loads issued by the
@// final loop iteration. subFFTSize already holds 4 * its entry value, so
@// pTwiddle is moved back by 2*subFFTSize + 16 bytes in total.
| 374 MOV pTmp,pDst |
| 375 SUB pSrc,pSrc,#64 @// Extra increment done in
final iteration of the loop |
| 376 SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= 4*size; pSrc -=
8*size bytes |
| 377 SUB pSrc,pTmp,outPointStep |
| 378 SUB pTwiddle,pTwiddle,subFFTSize,LSL #1 |
| 379 SUB pTwiddle,pTwiddle,#16 @// Extra increment done in
final iteration of the loop |
| 380 |
| 381 .endm |
| 382 |
| 383 |
@// Four entry points, one per (scaled, inverse) combination of FFTSTAGE.
@// The third macro argument only makes the loop label unique.
@// M_START / M_END are not defined in this file (presumably they come from
@// armCOMM_s.h); the meaning of the trailing "r4" argument should be
@// confirmed there.

@// Forward transform, unscaled butterfly (VADD/VSUB).
| 384 M_START armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4 |
| 385 FFTSTAGE "FALSE","FALSE",fwd |
| 386 M_END |
| 387 |
| 388 |
@// Inverse transform, unscaled butterfly (VADD/VSUB).
| 389 M_START armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4 |
| 390 FFTSTAGE "FALSE","TRUE",inv |
| 391 M_END |
| 392 |
| 393 |
@// Forward transform, scaled butterfly (VHADD/VHSUB).
| 394 M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4 |
| 395 FFTSTAGE "TRUE","FALSE",fwdsfs |
| 396 M_END |
| 397 |
| 398 |
@// Inverse transform, scaled butterfly (VHADD/VHSUB).
| 399 M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4 |
| 400 FFTSTAGE "TRUE","TRUE",invsfs |
| 401 M_END |
| 402 |
| 403 |
| 404 .end |
OLD | NEW |