Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 @// | |
| 2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. | |
| 3 @// | |
| 4 @// Use of this source code is governed by a BSD-style license | |
| 5 @// that can be found in the LICENSE file in the root of the source | |
| 6 @// tree. An additional intellectual property rights grant can be found | |
| 7 @// in the file PATENTS. All contributing project authors may | |
| 8 @// be found in the AUTHORS file in the root of the source tree. | |
| 9 @// | |
| 10 @// This file was originally licensed as follows. It has been | |
| 11 @// relicensed with permission from the copyright holders. | |
| 12 | |
| 13 @// | |
| 14 @// | |
| 15 @// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s | |
| 16 @// OpenMAX DL: v1.0.2 | |
| 17 @// Last Modified Revision: 7761 | |
| 18 @// Last Modified Date: Wed, 26 Sep 2007 | |
| 19 @// | |
| 20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. | |
| 21 @// | |
| 22 @// | |
| 23 @// | |
| 24 @// Description: | |
| 25 @// Compute a Radix 4 FFT stage for a N point complex signal | |
| 26 @// | |
| 27 @// | |
| 28 | |
| 29 | |
| 30 @// Include standard headers | |
| 31 | |
| 32 #include "dl/api/armCOMM_s.h" | |
| 33 #include "dl/api/omxtypes_s.h" | |
| 34 | |
| 35 | |
| 36 | |
| 37 @// Import symbols required from other files | |
| 38 @// (For example tables) | |
| 39 | |
| 40 | |
| 41 | |
| 42 | |
| 43 @// Set debugging level | |
| 44 @//DEBUG_ON SETL {TRUE} | |
| 45 | |
| 46 | |
| 47 @// Guarding implementation by the processor name | |
| 48 | |
| 49 | |
| 50 | |
| 51 @// Guarding implementation by the processor name | |
| 52 | |
| 53 | |
| 54 @// Import symbols required from other files | |
| 55 @// (For example tables) | |
| 56 | |
| 57 | |
| 58 @// Input registers: core-register aliases that FFTSTAGE reads on entry. FFTSTAGE rewrites subFFTNum and subFFTSize in place and swaps pSrc/pDst before it ends. | |
| 59 | |
| 60 #define pSrc r0 | |
| 61 #define pDst r2 | |
| 62 #define pTwiddle r1 | |
| 63 #define subFFTNum r6 | |
| 64 #define subFFTSize r7 | |
| 65 | |
| 66 | |
| 67 | |
| 68 @//Output Registers | |
| 69 | |
| 70 | |
| 71 @// Local scratch registers. t1 and grpCount both name r3; t1 is only used after the group loop has finished with grpCount. setCount is r14 (the link register) -- NOTE(review): presumably the return address is preserved by M_START/M_END; confirm. | |
| 72 | |
| 73 #define grpCount r3 | |
| 74 #define pointStep r4 | |
| 75 #define outPointStep r5 | |
| 76 #define stepTwiddle r12 | |
| 77 #define setCount r14 | |
| 78 #define srcStep r8 | |
| 79 #define setStep r9 | |
| 80 #define dstStep r10 | |
| 81 #define twStep r11 | |
| 82 #define t1 r3 | |
| 83 | |
| 84 @// NEON registers. Each Qn overlays D(2n) and D(2n+1), so several names alias: qX0=Q2={dXr0,dXi0}; qY0..qY3=Q6..Q9={dYr0,dYi0}..{dYr3,dYi3}; qZ0..qZ3=Q10..Q13={dZr0,dZi0}..{dZr3,dZi3}; the 32-bit temporaries qT0,qT1,qT2,qT3 reuse Q8,Q9,Q6,Q7 (the qY registers). NOTE(review): D8-D15 are callee-saved under the AAPCS -- confirm that the caller or M_START preserves them. | |
| 85 | |
| 86 #define dW1 D0.S16 | |
| 87 #define dW2 D1.S16 | |
| 88 #define dW3 D2.S16 | |
| 89 | |
| 90 #define dXr0 D4.S16 | |
| 91 #define dXi0 D5.S16 | |
| 92 #define dXr1 D6.S16 | |
| 93 #define dXi1 D7.S16 | |
| 94 #define dXr2 D8.S16 | |
| 95 #define dXi2 D9.S16 | |
| 96 #define dXr3 D10.S16 | |
| 97 #define dXi3 D11.S16 | |
| 98 #define dYr0 D12.S16 | |
| 99 #define dYi0 D13.S16 | |
| 100 #define dYr1 D14.S16 | |
| 101 #define dYi1 D15.S16 | |
| 102 #define dYr2 D16.S16 | |
| 103 #define dYi2 D17.S16 | |
| 104 #define dYr3 D18.S16 | |
| 105 #define dYi3 D19.S16 | |
| 106 #define qT0 Q8.S32 | |
| 107 #define qT1 Q9.S32 | |
| 108 #define qT2 Q6.S32 | |
| 109 #define qT3 Q7.S32 | |
| 110 | |
| 111 #define dZr0 D20.S16 | |
| 112 #define dZi0 D21.S16 | |
| 113 #define dZr1 D22.S16 | |
| 114 #define dZi1 D23.S16 | |
| 115 #define dZr2 D24.S16 | |
| 116 #define dZi2 D25.S16 | |
| 117 #define dZr3 D26.S16 | |
| 118 #define dZi3 D27.S16 | |
| 119 #define qY0 Q6.S16 | |
| 120 #define qY1 Q7.S16 | |
| 121 #define qY2 Q8.S16 | |
| 122 #define qY3 Q9.S16 | |
| 123 #define qX0 Q2.S16 | |
| 124 #define qZ0 Q10.S16 | |
| 125 #define qZ1 Q11.S16 | |
| 126 #define qZ2 Q12.S16 | |
| 127 #define qZ3 Q13.S16 | |
| 128 | |
| 129 | |
| 130 .MACRO FFTSTAGE scaled, inverse , name | |
| 131 | |
| 132 @// One radix-4 FFT stage. Macro arguments: scaled and inverse are compared against "TRUE" with .ifeqs to pick the code path (scaled selects halving VHADD/VHSUB instead of VADD/VSUB in the butterfly; inverse selects the twiddle-multiply signs and the output ordering); name is appended to the loop labels so each expansion gets unique labels. No stack arguments are defined here. | |
| 133 | |
| 134 | |
| 135 @// Update grpCount and grpSize right away in order to reuse the pGrpCount and pGrpSize registers | |
| 136 | |
| 137 LSL grpCount,subFFTSize,#2 | |
| 138 LSR subFFTNum,subFFTNum,#2 | |
| 139 MOV subFFTSize,grpCount | |
| 140 | |
| 141 | |
| 142 @// pOut0+1 increments pOut0 by 4 bytes | |
| 143 @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes | |
| 144 | |
| 145 MOV stepTwiddle,#0 | |
| 146 SMULBB outPointStep,grpCount,subFFTNum | |
| 147 | |
| 148 @// pT0+1 increments pT0 by 4 bytes | |
| 149 @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes | |
| 150 | |
| 151 LSL pointStep,subFFTNum,#2 @// 2*grpSize | |
| 152 | |
| 153 VLD1 dW1,[pTwiddle :64] @//[wi | wr] | |
| 154 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep | |
| 155 VLD1 dW2,[pTwiddle :64] @//[wi | wr] | |
| 156 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep | |
| 157 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16 | |
| 158 VLD1 dW3,[pTwiddle :64] | |
| 159 @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16 | |
| 160 RSB setStep,setStep,#0 @// setStep = - 3*pointStep | |
| 161 | |
| 162 MOV dstStep,outPointStep,LSL #1 | |
| 163 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep | |
| 164 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16 | |
| 165 | |
| 166 | |
| 167 | |
| 168 grpLoop\name: | |
| 169 | |
| 170 VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] | |
| 171 ADD stepTwiddle,stepTwiddle,pointStep | |
| 172 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] | |
| 173 ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point | |
| 174 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] | |
| 175 MOV twStep,stepTwiddle,LSL #2 | |
| 176 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc | |
| 177 | |
| 178 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle | |
| 179 | |
| 180 | |
| 181 MOV setCount,pointStep,LSR #2 | |
| 182 ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set | |
| 183 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set | |
| 184 | |
| 185 @// Loop on the sets : 4 at a time | |
| 186 | |
| 187 setLoop\name: | |
| 188 | |
| 189 SUBS setCount,setCount,#4 @// decrement the loop counter; the flags set here are tested by the BGT at the end of the set loop | |
| 190 | |
| 191 .ifeqs "\inverse", "TRUE" | |
| 192 VMULL qT0,dXr1,dW1[0] | |
| 193 VMLAL qT0,dXi1,dW1[1] @// real part | |
| 194 VMULL qT1,dXi1,dW1[0] | |
| 195 VMLSL qT1,dXr1,dW1[1] @// imag part | |
| 196 | |
| 197 .ELSE | |
| 198 VMULL qT0,dXr1,dW1[0] | |
| 199 VMLSL qT0,dXi1,dW1[1] @// real part | |
| 200 VMULL qT1,dXi1,dW1[0] | |
| 201 VMLAL qT1,dXr1,dW1[1] @// imag part | |
| 202 | |
| 203 .ENDIF | |
| 204 | |
| 205 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] | |
| 206 | |
| 207 .ifeqs "\inverse", "TRUE" | |
| 208 VMULL qT2,dXr2,dW2[0] | |
| 209 VMLAL qT2,dXi2,dW2[1] @// real part | |
| 210 VMULL qT3,dXi2,dW2[0] | |
| 211 VMLSL qT3,dXr2,dW2[1] @// imag part | |
| 212 | |
| 213 .ELSE | |
| 214 VMULL qT2,dXr2,dW2[0] | |
| 215 VMLSL qT2,dXi2,dW2[1] @// real part | |
| 216 VMULL qT3,dXi2,dW2[0] | |
| 217 VMLAL qT3,dXr2,dW2[1] @// imag part | |
| 218 | |
| 219 .ENDIF | |
| 220 | |
| 221 VRSHRN dZr1,qT0,#15 | |
| 222 VRSHRN dZi1,qT1,#15 | |
| 223 | |
| 224 | |
| 225 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] | |
| 226 | |
| 227 .ifeqs "\inverse", "TRUE" | |
| 228 VMULL qT0,dXr3,dW3[0] | |
| 229 VMLAL qT0,dXi3,dW3[1] @// real part | |
| 230 VMULL qT1,dXi3,dW3[0] | |
| 231 VMLSL qT1,dXr3,dW3[1] @// imag part | |
| 232 | |
| 233 .ELSE | |
| 234 VMULL qT0,dXr3,dW3[0] | |
| 235 VMLSL qT0,dXi3,dW3[1] @// real part | |
| 236 VMULL qT1,dXi3,dW3[0] | |
| 237 VMLAL qT1,dXr3,dW3[1] @// imag part | |
| 238 | |
| 239 .ENDIF | |
| 240 | |
| 241 VRSHRN dZr2,qT2,#15 | |
| 242 VRSHRN dZi2,qT3,#15 | |
| 243 | |
| 244 | |
| 245 VRSHRN dZr3,qT0,#15 | |
| 246 VRSHRN dZi3,qT1,#15 | |
| 247 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set. NOTE(review): flagged in code review as a 16-byte out-of-bounds read when both loops are on their last iteration -- confirm and fix | |

aedla
2013/06/26 20:24:46
16 byte OOB read when both loops are at their last
| |
| 248 | |
| 249 | |
| 250 .ifeqs "\scaled", "TRUE" | |
| 251 | |
| 252 @// finish first stage of 4 point FFT | |
| 253 VHADD qY0,qX0,qZ2 | |
| 254 VHSUB qY2,qX0,qZ2 | |
| 255 | |
| 256 VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] | |
| 257 VHADD qY1,qZ1,qZ3 | |
| 258 VHSUB qY3,qZ1,qZ3 | |
| 259 | |
| 260 | |
| 261 @// finish second stage of 4 point FFT | |
| 262 | |
| 263 .ifeqs "\inverse", "TRUE" | |
| 264 | |
| 265 VHSUB qZ0,qY2,qY1 | |
| 266 | |
| 267 VHADD dZr2,dYr0,dYi3 | |
| 268 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
| 269 VHSUB dZi2,dYi0,dYr3 | |
| 270 | |
| 271 VHADD qZ1,qY2,qY1 | |
| 272 VST2 {dZr2,dZi2},[pDst :128],outPointStep | |
| 273 | |
| 274 VHSUB dZr3,dYr0,dYi3 | |
| 275 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
| 276 VHADD dZi3,dYi0,dYr3 | |
| 277 VST2 {dZr3,dZi3},[pDst :128],dstStep | |
| 278 | |
| 279 | |
| 280 .ELSE | |
| 281 | |
| 282 VHSUB qZ0,qY2,qY1 | |
| 283 | |
| 284 VHSUB dZr3,dYr0,dYi3 | |
| 285 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
| 286 VHADD dZi3,dYi0,dYr3 | |
| 287 | |
| 288 VHADD qZ1,qY2,qY1 | |
| 289 VST2 {dZr3,dZi3},[pDst :128],outPointStep | |
| 290 | |
| 291 VHADD dZr2,dYr0,dYi3 | |
| 292 VHSUB dZi2,dYi0,dYr3 | |
| 293 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
| 294 VST2 {dZr2,dZi2},[pDst :128],dstStep | |
| 295 | |
| 296 | |
| 297 .ENDIF | |
| 298 | |
| 299 | |
| 300 .ELSE | |
| 301 | |
| 302 @// finish first stage of 4 point FFT | |
| 303 VADD qY0,qX0,qZ2 | |
| 304 VSUB qY2,qX0,qZ2 | |
| 305 | |
| 306 VLD2 {dXr0,dXi0},[pSrc]! @// data[0] (no :128 alignment qualifier here, unlike the same load in the scaled path) | |
| 307 VADD qY1,qZ1,qZ3 | |
| 308 VSUB qY3,qZ1,qZ3 | |
| 309 | |
| 310 | |
| 311 @// finish second stage of 4 point FFT | |
| 312 | |
| 313 | |
| 314 .ifeqs "\inverse", "TRUE" | |
| 315 | |
| 316 VSUB qZ0,qY2,qY1 | |
| 317 | |
| 318 VADD dZr2,dYr0,dYi3 | |
| 319 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
| 320 VSUB dZi2,dYi0,dYr3 | |
| 321 | |
| 322 VADD qZ1,qY2,qY1 | |
| 323 VST2 {dZr2,dZi2},[pDst :128],outPointStep | |
| 324 | |
| 325 VSUB dZr3,dYr0,dYi3 | |
| 326 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
| 327 VADD dZi3,dYi0,dYr3 | |
| 328 VST2 {dZr3,dZi3},[pDst :128],dstStep | |
| 329 | |
| 330 | |
| 331 .ELSE | |
| 332 | |
| 333 VSUB qZ0,qY2,qY1 | |
| 334 | |
| 335 VSUB dZr3,dYr0,dYi3 | |
| 336 VST2 {dZr0,dZi0},[pDst :128],outPointStep | |
| 337 VADD dZi3,dYi0,dYr3 | |
| 338 | |
| 339 VADD qZ1,qY2,qY1 | |
| 340 VST2 {dZr3,dZi3},[pDst :128],outPointStep | |
| 341 | |
| 342 VADD dZr2,dYr0,dYi3 | |
| 343 VSUB dZi2,dYi0,dYr3 | |
| 344 VST2 {dZr1,dZi1},[pDst :128],outPointStep | |
| 345 VST2 {dZr2,dZi2},[pDst :128],dstStep | |
| 346 | |
| 347 | |
| 348 .ENDIF | |
| 349 | |
| 350 | |
| 351 | |
| 352 .ENDIF | |
| 353 | |
| 354 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set | |
| 355 BGT setLoop\name | |
| 356 | |
| 357 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr] | |
| 358 SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4 | |
| 359 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr] | |
| 360 ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp | |
| 361 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr] NOTE(review): flagged in code review as an 8-byte out-of-bounds read on the last iteration -- confirm and fix | |

aedla
2013/06/26 20:24:46
8 byte OOB read at the last iteration, coming from
| |
| 362 | |
| 363 | |
| 364 | |
| 365 BGT grpLoop\name | |
| 366 | |
| 367 | |
| 368 @// Reset and Swap pSrc and pDst for the next stage | |
| 369 MOV t1,pDst | |
| 370 SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= size; pSrc -= 4*size bytes | |
| 371 SUB pSrc,t1,outPointStep | |
| 372 | |
| 373 | |
| 374 .endm | |
| 375 | |
| 376 @// Forward stage without scaling: expands FFTSTAGE with scaled="FALSE", inverse="FALSE"; loop labels get the FWD suffix. NOTE(review): M_START/M_END are not defined in this file -- presumably they come from armCOMM_s.h; confirm what the r4 argument preserves. | |
| 377 M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4 | |
| 378 FFTSTAGE "FALSE","FALSE",FWD | |
| 379 M_END | |
| 380 | |
| 381 @// Inverse stage without scaling: expands FFTSTAGE with scaled="FALSE", inverse="TRUE"; loop labels get the INV suffix. | |
| 382 M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4 | |
| 383 FFTSTAGE "FALSE","TRUE",INV | |
| 384 M_END | |
| 385 | |
| 386 @// Forward stage with scaling: expands FFTSTAGE with scaled="TRUE", inverse="FALSE"; loop labels get the FWDSFS suffix. | |
| 387 M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4 | |
| 388 FFTSTAGE "TRUE","FALSE",FWDSFS | |
| 389 M_END | |
| 390 | |
| 391 @// Inverse stage with scaling: expands FFTSTAGE with scaled="TRUE", inverse="TRUE"; loop labels get the INVSFS suffix. | |
| 392 M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4 | |
| 393 FFTSTAGE "TRUE","TRUE",INVSFS | |
| 394 M_END | |
| 395 | |
| 396 | |
| 397 | |
| 398 | |
| 399 | |
| 400 .END | |
| OLD | NEW |