third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S ('K') | « third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S ('k') | third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c » ('j') | third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @// This file was originally licensed as follows. It has been

	11 @// relicensed with permission from the copyright holders.

	12 @//

	13

	14 @//

	15 @// File Name: omxSP_FFTFwd_RToCCS_S32_Sfs_s.s

	16 @// OpenMAX DL: v1.0.2

	17 @// Last Modified Revision: 7810

	18 @// Last Modified Date: Thu, 04 Oct 2007

	19 @//

	20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.

	21 @//

	22 @//

	23 @//

	24 @// Description:

	25 @// Compute FFT for a real signal

	26 @//

	27

	28

	29

	30 @// Include standard headers

	31

	32 #include "dl/api/armCOMM_s.h"

	33 #include "dl/api/omxtypes_s.h"

	34

	35

	36 @// Import symbols required from other files

	37 @// (For example tables)

	38

	39 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe

	40 .extern armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe

	41 .extern armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe

	42 .extern armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe

	43 .extern armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe

	44 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe

	45 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe

	46 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe

	47 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe

	48 .extern armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe

	49

	50 @// Set debugging level

	51 @//DEBUG_ON SETL {TRUE}

	52

	53

	54

	55 @// Guarding implementation by the processor name

	56

	57

	58

	59 @// Guarding implementation by the processor name

	60

	61 @// Import symbols required from other files

	62 @// (For example tables)

	63 .extern armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe

	64 .extern armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe

	65 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe

	66 .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe

	67

	68

	69 @// Input registers

	70 @// NOTE: r1 and r2 are reused below as argTwiddle and argDst, so pDst and pFFTSpec have to be consumed before those aliases are written.

	71 #define pSrc r0

	72 #define pDst r1

	73 #define pFFTSpec r2

	74 #define scale r3

	75

	76

	77 @// Output registers

	78 #define result r0

	79

	80 @// Local scratch registers

	81 @// NOTE: several names share one physical register (r4: argScale/tmpOrder/pTwiddle/x0r/step1, r5: pOut/x0i/pTwiddleTmp, r6: subFFTNum/N/subFFTSizeTmp, r8: count/twStep, r9: diff/zero). order is r14, the ARM link register, so its value is overwritten by any BL.

	82 #define argTwiddle r1

	83 #define argDst r2

	84 #define argScale r4

	85 #define tmpOrder r4

	86 #define pTwiddle r4

	87 #define pOut r5

	88 #define subFFTSize r7

	89 #define subFFTNum r6

	90 #define N r6

	91 #define order r14

	92 #define diff r9

	93 @// Total number of radix stages required to complete the FFT

	94 #define count r8

	95 #define x0r r4

	96 #define x0i r5

	97 #define diffMinusOne r2

	98 #define subFFTSizeTmp r6

	99 #define step r3

	100 #define step1 r4

	101 #define twStep r8

	102 #define zero r9

	103 #define pTwiddleTmp r5

	104 #define t0 r10

	105

	106 @// Neon registers

	107 @// NOTE: NEON names overlap as well (d2: dZero/dX0r, d3: dShift/dX0i/dX1, d4: dX1r/dW0, d5: dX1i/dW1, d14-d17: dW0r/dW0i/dW1r/dW1i and dY0r/dY0i/dY1r/dY1i).

	108 #define dX0 d0.s32

	109 #define dzero d1.s32

	110 #define dZero d2.s32

	111 #define dShift d3.s32

	112 #define dX0r d2.s32

	113 #define dX0i d3.s32

	114 #define dX1r d4.s32

	115 #define dX1i d5.s32

	116 #define dT0 d6.s32

	117 #define dT1 d7.s32

	118 #define dT2 d8.s32

	119 #define dT3 d9.s32

	120 #define qT0 q5.s64

	121 #define qT1 q6.s64

	122 #define dW0r d14.s32

	123 #define dW0i d15.s32

	124 #define dW1r d16.s32

	125 #define dW1i d17.s32

	126 #define dY0r d14.s32

	127 #define dY0i d15.s32

	128 #define dY1r d16.s32

	129 #define dY1i d17.s32

	130 #define dY0rS64 d14.s64

	131 #define dY0iS64 d15.s64

	132 #define qT2 q9.s64

	133 #define qT3 q10.s64

	134 @// Last three elements

	135 #define dX1 d3.s32

	136 #define dW0 d4.s32

	137 #define dW1 d5.s32

	138 #define dY0 d10.s32

	139 #define dY1 d11.s32

	140 #define dY2 d12.s32

	141 #define dY3 d13.s32

	142

	143 @// Allocate stack memory required by the function

	144 @// diffOnStack is the stack slot used further down to keep the leftover scale across the stage calls

	145 M_ALLOC4 diffOnStack, 4

	146

	147 @// Write function header

	148 M_START omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15

	149

	150 @ Structure offsets for the FFTSpec

	151 .set ARMsFFTSpec_N, 0

	152 .set ARMsFFTSpec_pBitRev, 4

	153 .set ARMsFFTSpec_pTwiddle, 8

	154 .set ARMsFFTSpec_pBuf, 12

	155

	156 @// Define stack arguments

	157

	158 @// Read the FFT size N from the spec structure (its log2 is taken later, once N > 1 is known)

	159 LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	160

	161 @// Read other structure parameters

	162 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]

	163 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

	164

	165 @// N=1: treat separately. The single input word is shifted right by scale and stored followed by two zero words.

	166 CMP N,#1

	167 BGT sizeGreaterThanOne @// signed compare: any N <= 1 falls through to the single-sample path

	168 VLD1 dX0[0],[pSrc] @// load one 32-bit sample into lane 0

	169 RSB scale,scale,#0 @// to use VRSHL for right shift by a variable

	170 MOV zero,#0

	171 VMOV dShift[0],scale

	172 VMOV dzero[0],zero

	173 VRSHL dX0,dShift @// negative shift count => rounding right shift by scale

	174 VMOV dZero[0],zero

	175 VST3 {dX0[0],dzero[0],dZero[0]},[pDst] @// writes three consecutive words: {x, 0, 0}

	176

	177 B End

	178

	179

	180

	181 sizeGreaterThanOne:

	182 @// Do a N/2 point complex FFT including the scaling

	183 @// Dispatch on order = log2(N/2): order > 3, 1 <= order <= 3, or order = 0 (handled inline below)

	184 MOV N,N,ASR #1 @// N/2 point complex FFT

	185

	186 CLZ order,N @// N = 2^order

	187 RSB order,order,#31 @// order = 31 - clz(N)

	188 MOV subFFTSize,#1

	189 @//MOV subFFTNum,N (subFFTNum and N are both r6, so no move is needed)

	190

	191 CMP order,#3

	192 BGT orderGreaterthan3 @// order > 3

	193

	194 CMP order,#1

	195 BGE orderGreaterthan0 @// order > 0

	196 M_STR scale, diffOnStack,LT @// order = 0

	197 VLD1 dX0,[pSrc] @// order = 0: copy one 64-bit element from the input ...

	198 VST1 dX0,[pOut] @// ... into the buffer read from the spec (pBuf)

	199 MOV pSrc,pOut

	200 MOV argDst,pDst

	201 BLT FFTEnd @// still the LT result of CMP order,#1 (VLD1/VST1/MOV leave flags alone; assumes M_STR does too - TODO confirm)

	202

	203 orderGreaterthan0:

	204 @// 1 <= order <= 3: set the buffers appropriately for various orders

	205 CMP order,#2

	206 MOVEQ argDst,pDst

	207 MOVNE argDst,pOut

	208 MOVNE pOut,pDst @// Pass the first stage destination in RN5

	209 MOV argTwiddle,pTwiddle @// r1 now holds the twiddle pointer; pDst (also r1) was copied out above

	210

	211 SUBS diff,scale,order @// diff = scale - order

	212 M_STR diff,diffOnStack

	213 MOVGT scale,order @// clamp scale to order (uses the SUBS flags; assumes M_STR preserves them - TODO confirm)

	214 @// Now scale <= order

	215

	216 CMP order,#1

	217 BGT orderGreaterthan1

	218 SUBS scale,scale,#1 @// EQ if scale was 1, LT if scale was 0

	219 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// order = 1

	220 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe @// order = 1. NOTE(review): if the BLEQ above was taken, this tests the flags the callee returned with - confirm it cannot return LT

	221 B FFTEnd

	222

	223 orderGreaterthan1: @// order = 2 or 3; order = 2 is handled here as two radix-2 stages (first, last)

	224 CMP order,#2

	225 MOV argScale,scale @// r4 (previously pTwiddle) now counts the scaled stages still to run

	226 BGT orderGreaterthan2

	227 SUBS argScale,argScale,#1

	228 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// order = 2, scale >= 1: scaled first stage

	229 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe @// scale = 0: unscaled first stage

	230 SUBS argScale,argScale,#1 @// presumably argScale (r4) is preserved by the stage routines - verify against callee

	231 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe @// scale = 2: scaled last stage

	232 BLLT armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe @// scale < 2: unscaled last stage

	233 B FFTEnd

	234

	235 orderGreaterthan2:@// order = 3: three radix-2 stages (first, middle, last); argScale is decremented before each

	236 SUBS argScale,argScale,#1

	237 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// scale >= 1: scaled first stage

	238 BLLT armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe @// scale = 0: unscaled first stage

	239 SUBS argScale,argScale,#1

	240 BLGE armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe @// scale >= 2: scaled middle stage

	241 BLLT armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe @// scale < 2: unscaled middle stage

	242 SUBS argScale,argScale,#1

	243 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe @// scale = 3: scaled last stage

	244 BLLT armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe @// scale < 3: unscaled last stage

	245 B FFTEnd

	246

	247

	248

	249 orderGreaterthan3:

	250 @// check scale = 0 or scale = order

	251 SUBS diff, scale, order @// diff = scale - order; GT means scale > order

	252 MOVGT scale,order @// clamp scale to order

	253 BGE specialScaleCase @// scale >= order: handled as scale = order, diff carries the excess

	254 CMP scale,#0

	255 BEQ specialScaleCase @// scale = 0

	256 B generalScaleCase @// 0 < scale < order

	257

	258 specialScaleCase:@// scale = 0 or scale = order and order >= 2

	259

	260 TST order, #2 @// Set input args to fft stages (bit 1 of order selects the buffers)

	261 MOVEQ argDst,pDst

	262 MOVNE argDst,pOut

	263 MOVNE pOut,pDst @// Pass the first stage destination in RN5

	264 MOV argTwiddle,pTwiddle

	265

	266 CMP diff,#0

	267 M_STR diff, diffOnStack

	268 BGE scaleEqualsOrder @// diff >= 0: scaled stages; diff < 0 (scale = 0) falls through to the unscaled stages

	269

	270 @//check for even or odd order

	271 @// NOTE: The following combination of BL's would work fine even though the first

	272 @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside

	273 @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ

	274

	275 TST order,#0x00000001

	276 BLEQ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe @// even order: radix-4 first stage

	277 BLNE armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe @// odd order: radix-8 first stage

	278

	279 CMP subFFTNum,#4 @// presumably subFFTNum is updated by each stage routine - verify against callee

	280 BLT FFTEnd

	281

	282

	283 unscaledRadix4Loop: @// entered with the flags of the latest CMP subFFTNum,#4

	284 BEQ lastStageUnscaledRadix4

	285 BL armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe

	286 CMP subFFTNum,#4

	287 B unscaledRadix4Loop

	288

	289 lastStageUnscaledRadix4:

	290 BL armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe

	291 B FFTEnd

	292

	293

	294 scaleEqualsOrder:

	295 @//check for even or odd order

	296 @// NOTE: The following combination of BL's would work fine even though the first

	297 @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside

	298 @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ

	299 @// NOTE(review): the routine called below is the Sfs variant, not the one named above - confirm it also returns with Z set

	300 TST order,#0x00000001

	301 BLEQ armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe @// even order: scaled radix-4 first stage

	302 BLNE armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe @// odd order: scaled radix-8 first stage

	303

	304 CMP subFFTNum,#4 @// presumably subFFTNum is updated by each stage routine - verify against callee

	305 BLT FFTEnd

	306

	307

	308 scaledRadix4Loop: @// entered with the flags of the latest CMP subFFTNum,#4

	309 BEQ lastStageScaledRadix4

	310 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe

	311 CMP subFFTNum,#4

	312 B scaledRadix4Loop

	313

	314 lastStageScaledRadix4:

	315 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe

	316 B FFTEnd

	317

	318 generalScaleCase:@// 0 < scale < order and order >= 2

	319 @// Determine the correct destination buffer

	320 SUB diff,order,scale @// diff = number of stages' worth of order left after the scaled radix-2 stages

	321 TST diff,#0x01

	322 ADDEQ count, scale,diff,lsr #1 @// count = scale + (order - scale)/2

	323 MOVNE count, order @// odd diff: count = order

	324 TST count, #0x01 @// Is count even or odd ?

	325

	326 MOVEQ argDst,pDst @// Set input args to fft stages

	327 MOVNE argDst,pOut

	328 MOVNE pOut,pDst @// Pass the first stage destination in RN5

	329 MOV argTwiddle,pTwiddle

	330

	331 M_STR diff, diffOnStack @// here the slot holds order - scale (reloaded after the scaled stages)

	332

	333 MOV argScale,scale @// Put scale in RN4 so as to save and restore

	334 BL armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage

	335 SUBS argScale,argScale,#1

	336

	337 scaledRadix2Loop:

	338 BLGT armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe @// runs only while scaled stages remain (argScale > 0)

	339 SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages

	340 BGT scaledRadix2Loop

	341

	342

	343 M_LDR diff, diffOnStack

	344 @// check whether diff (order - scale) is even or odd: even => radix-4 stages, odd => radix-2 stages

	345 TST diff,#0x00000001

	346 BEQ generalUnscaledRadix4Loop

	347 B unscaledRadix2Loop

	348

	349 generalUnscaledRadix4Loop:

	350 CMP subFFTNum,#4 @// presumably subFFTNum is updated by each stage routine - verify against callee

	351 BEQ generalLastStageUnscaledRadix4

	352 BL armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe

	353 B generalUnscaledRadix4Loop

	354

	355 generalLastStageUnscaledRadix4:

	356 BL armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe

	357 B finalComplexToRealFixup @// bypasses FFTEnd, so the positive value in diffOnStack is not applied as a shift

	358

	359

	360 unscaledRadix2Loop:

	361 CMP subFFTNum,#2

	362 BEQ generalLastStageUnscaledRadix2

	363 BL armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe

	364 B unscaledRadix2Loop

	365

	366 generalLastStageUnscaledRadix2:

	367 BL armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe

	368 B finalComplexToRealFixup @// bypasses FFTEnd, as above

	369

	370

	371 FFTEnd:@// Does only the scaling

	372 @// Applies the leftover shift stored in diffOnStack to subFFTSize 64-bit elements in place at pSrc; skipped when diff <= 0

	373 M_LDR diff, diffOnStack

	374 CMP diff,#0

	375 BLE finalComplexToRealFixup

	376

	377 RSB diff,diff,#0 @// to use VRSHL for right shift by a variable

	378 VDUP dShift,diff

	379

	380 @// save subFFTSize and use subFFTSizeTmp in the following loop

	381 MOV subFFTSizeTmp,subFFTSize @// subFFTSizeTmp same reg as subFFTNum

	382

	383 scaleFFTData:@// N = subFFTSize ; dataptr = pDst ; scale = diff

	384 VLD1 {dX0},[pSrc] @// pSrc contains pDst pointer

	385 SUBS subFFTSizeTmp,subFFTSizeTmp,#1

	386 VRSHL dX0,dShift @// both 32-bit lanes shifted right by diff, with rounding

	387 VST1 {dX0},[pSrc]! @// write back in place and advance 8 bytes

	388

	389 BGT scaleFFTData @// flags from the SUBS above

	390

	391 SUB pSrc,pSrc,subFFTSize,LSL #3 @// reset pSrc for final fixup

	392

	393 @// change the logic so that output after scaling is in pOut and not in pDst

	394 @// finally store from pOut to pDst

	395 @// change branch "End" to branch "finalComplexToRealFixup" in the above

	396 @// chk the code below for multiplication by j factor

	397

	398 finalComplexToRealFixup:

	399 @// Writes F(0) and F(N/2) from Z(0), then sets up pointers and strides for the butterfly loop.

	400 @// On entry subFFTSize is used as the complex element count, pSrc as the input and argDst as the output.

	401 @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]

	402 @// 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]

	403 @// 1/2[2a+j0] - j [0+j2b]

	404 @// (a+b, 0)

	405

	406 @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]

	407 @// 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]

	408 @// 1/2[2a+j0] + j [0+j2b]

	409 @// (a-b, 0)

	410

	411 @// F(0) and F(N/2)

	412 VLD2 {dX0r[0],dX0i[0]},[pSrc]!

	413 MOV zero,#0

	414 VMOV dX0r[1],zero

	415 MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes

	416 VMOV dX0i[1],zero

	417 SUB twStep,step,subFFTSize,LSL #1 @// twStep = 3N/8 * 8 bytes pointing to W^1

	418

	419 VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)

	420 MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes

	421 VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)

	422 SUBS subFFTSize,subFFTSize,#2 @// these flags drive BLT/BEQ below; nothing in between sets flags

	423

	424 VST1 dY0r,[argDst],step @// store F(0), then advance argDst by step

	425 ADD pTwiddleTmp,argTwiddle,#8 @// W^2

	426 VST1 dY0i,[argDst]! @// store F(N/2), then advance 8 bytes

	427 ADD argTwiddle,argTwiddle,twStep @// W^1

	428

	429 VDUP dzero,zero

	430 SUB argDst,argDst,step @// argDst is now 8 bytes past where F(0) was stored

	431

	432 BLT End @// subFFTSize was 1: nothing further to do

	433 BEQ lastElement @// subFFTSize was 2: only the last element remains

	434 SUB step,step,#24

	435 SUB step1,step1,#8 @// (N/4-1)*8 bytes

	436

	437 @// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]

	438 @// Note: W^k is stored as negative values in the table

	439 @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them

	440 @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)

	441

	442

	443 evenOddButterflyLoop:

	444 @// Each pass reads twiddles through argTwiddle and pTwiddleTmp and two complex pairs from each end of the input,

	445 @// then rewinds the pointers with the shrinking step/step1 strides.

	446 VLD1 dW0r,[argTwiddle],step1

	447 VLD1 dW1r,[argTwiddle]!

	448

	449 VLD2 {dX0r,dX0i},[pSrc],step @// de-interleaving load: real parts to dX0r, imaginary parts to dX0i

	450 SUB argTwiddle,argTwiddle,step1

	451 VLD2 {dX1r,dX1i},[pSrc]!

	452

	453

	454

	455 SUB step1,step1,#8 @// (N/4-2)*8 bytes

	456 VLD1 dW0i,[pTwiddleTmp],step1

	457 VLD1 dW1i,[pTwiddleTmp]!

	458 SUB pSrc,pSrc,step

	459

	460 SUB pTwiddleTmp,pTwiddleTmp,step1

	461 VREV64 dX1r,dX1r @// swap the two 32-bit lanes of the far-end pair

	462 VREV64 dX1i,dX1i

	463 SUBS subFFTSize,subFFTSize,#4 @// loop counter; the flags are consumed by the BGT at the bottom

	464

	465

	466

	467 VSUB dT2,dX0r,dX1r @// a-c

	468 SUB step1,step1,#8

	469 VADD dT3,dX0i,dX1i @// b+d

	470 VADD dT0,dX0r,dX1r @// a+c

	471 VSUB dT1,dX0i,dX1i @// b-d

	472 VHADD dT0,dT0,dzero @// halving add with zero: (a+c)/2

	473 VHADD dT1,dT1,dzero @// (b-d)/2

	474

	475 VZIP dW1r,dW1i

	476 vzip dW0r,dW0i

	477

	478

	479 VMULL qT0,dW1r,dT2

	480 VMLAL qT0,dW1i,dT3

	481 VMULL qT1,dW1r,dT3

	482 VMLSL qT1,dW1i,dT2

	483

	484 VMULL qT2,dW0r,dT2

	485 VMLSL qT2,dW0i,dT3

	486 VMULL qT3,dW0r,dT3

	487 VMLAL qT3,dW0i,dT2

	488

	489

	490 VRSHRN dX1r,qT0,#32 @// keep the rounded upper 32 bits of each 64-bit sum

	491 VRSHRN dX1i,qT1,#32

	492

	493 VSUB dY1r,dT0,dX1i @// F(N/2 -1); dY1r/dY1i reuse the dW1r/dW1i registers, which are no longer needed here

	494 VADD dY1i,dT1,dX1r

	495 VNEG dY1i,dY1i

	496

	497 VREV64 dY1r,dY1r

	498 VREV64 dY1i,dY1i

	499

	500

	501 VRSHRN dX0r,qT2,#32

	502 VRSHRN dX0i,qT3,#32

	503

	504

	505 VSUB dY0r,dT0,dX0i @// F(1); dY0r/dY0i reuse the dW0r/dW0i registers

	506 VADD dY0i,dT1,dX0r

	507

	508

	509 VST2 {dY0r,dY0i},[argDst],step

	510 VST2 {dY1r,dY1i},[argDst]!

	511 SUB argDst,argDst,step

	512 SUB step,step,#32 @// (N/2-4)*8 bytes

	513

	514

	515 BGT evenOddButterflyLoop @// flags from SUBS subFFTSize,#4 above (SUB and the NEON instructions in between do not set flags)

	516

	517 SUB pSrc,pSrc,#8 @// set both the ptrs to the last element

	518 SUB argDst,argDst,#8

	519

	520

	521

	522 @// Last element can be expanded as follows

	523 @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]

	524 @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]

	525 @// 1/2[2a+j0] + j (c+jd) [0+j2b]

	526 @// (a-bc, -bd)

	527 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	528

	529 lastElement:

	530 VLD1 dX0r,[pSrc] @// load (a, b)

	531

	532 VST1 dX0r[0],[argDst]! @// store a

	533 VNEG dX0r,dX0r

	534 VST1 dX0r[1],[argDst]! @// store -b

	535

	536

	537

	538

	539

	540

	541 End:

	542 @// Set return value

	543 MOV result, #OMX_Sts_NoErr

	544

	545 @// Write function tail

	546 M_END

	547

	548 .end

	549

OLD	NEW