third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S ('K') | « third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S ('k') | third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S » ('j') | third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s

	11 @// to support float instead of SC32.

	12 @//

	13

	14 @//

	15 @// Description:

	16 @// Compute FFT for a real signal

	17 @//

	18 @//

	19

	20

	21 @// Include standard headers

	22

	23 #include "dl/api/armCOMM_s.h"

	24 #include "dl/api/omxtypes_s.h"

	25

	26

	27 @// Import symbols required from other files

	28 @// (For example tables)

	29

	30 .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe

	31 .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe

	32 .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

	33 .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe

	34 .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe

	35 .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

	36 .extern armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

	37

	38 @// Set debugging level

	39 @//DEBUG_ON SETL {TRUE}

	40

	41

	42

	43 @// Guarding implementation by the processor name

	44

	45

	46

	47 @// Guarding implementation by the processor name

	48

	49 @// Import symbols required from other files

	50 @// (For example tables)

	51 .extern armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe

	52 .extern armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

	53

	54

	55 @//Input Registers

	56 @// r0-r3 carry the four arguments; each register is reused under another name further down (r0 = result, r1 = argTwiddle, r2 = argDst/diffMinusOne, r3 = step)

	57 #define pSrc r0

	58 #define pDst r1

	59 #define pFFTSpec r2

	60 #define scale r3

	61

	62

	63 @// Output registers

	64 #define result r0

	65

	66 @//Local Scratch Registers

	67 @// Several names share one register: r4 = argScale/tmpOrder/pTwiddle/x0r/step1, r5 = pOut/x0i/pTwiddleTmp, r6 = subFFTNum/N/subFFTSizeTmp, r8 = count/twStep, r9 = diff/zero

	68 #define argTwiddle r1

	69 #define argDst r2

	70 #define argScale r4

	71 #define tmpOrder r4

	72 #define pTwiddle r4

	73 #define pOut r5

	74 #define subFFTSize r7

	75 #define subFFTNum r6

	76 #define N r6

	77 #define order r14

	78 #define diff r9

	79 @// Total number of radix stages required to complete the FFT

	80 #define count r8

	81 #define x0r r4

	82 #define x0i r5

	83 #define diffMinusOne r2

	84 #define subFFTSizeTmp r6

	85 #define step r3

	86 #define step1 r4

	87 #define twStep r8

	88 #define zero r9

	89 #define pTwiddleTmp r5

	90 #define t0 r10

	91 @// order is r14, i.e. the link register, so its value does not survive a BL

	92 @// Neon registers

	93 @// Every name here is a D register typed .f32 (the q-prefixed ones too). Shared registers: d0 = dX0/half, d2 = dZero/dX0r, d3 = dShift/dX0i/dX1, d4 = dX1r/dW0, d5 = dX1i/dW1, d10 = qT0/dY0, d12 = qT1/dY2, d14-d17 = dW0r,dW0i,dW1r,dW1i / dY0r,dY0i,dY1r,dY1i

	94 #define dX0 d0.f32

	95 #define dzero d1.f32

	96 #define dZero d2.f32

	97 #define dShift d3.f32

	98 #define dX0r d2.f32

	99 #define dX0i d3.f32

	100 #define dX1r d4.f32

	101 #define dX1i d5.f32

	102 #define dT0 d6.f32

	103 #define dT1 d7.f32

	104 #define dT2 d8.f32

	105 #define dT3 d9.f32

	106 #define qT0 d10.f32

	107 #define qT1 d12.f32

	108 #define dW0r d14.f32

	109 #define dW0i d15.f32

	110 #define dW1r d16.f32

	111 #define dW1i d17.f32

	112 #define dY0r d14.f32

	113 #define dY0i d15.f32

	114 #define dY1r d16.f32

	115 #define dY1i d17.f32

	116 #define dY0rS64 d14.s64

	117 #define dY0iS64 d15.s64

	118 #define qT2 d18.f32

	119 #define qT3 d20.f32

	120 @// Last three elements (dX1, dW0, dW1 and dY0-dY3 are not referenced by omxSP_FFTFwd_RToCCS_F32_Sfs below)

	121 #define dX1 d3.f32

	122 #define dW0 d4.f32

	123 #define dW1 d5.f32

	124 #define dY0 d10.f32

	125 #define dY1 d11.f32

	126 #define dY2 d12.f32

	127 #define dY3 d13.f32

	128

	129 #define half d0.f32

	130

	131 HALF: .float 0.5 @// constant 0.5, loaded into half[0] just before evenOddButterflyLoop

	132

	133 @// Allocate stack memory required by the function

	134

	135 @// Write function header

	136 M_START omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15

	137 @// In: r0 = pSrc, r1 = pDst, r2 = pFFTSpec. r3 (scale) is never read below and is reused as step. Out: r0 = OMX_Sts_NoErr. NOTE(review): M_START/M_END come from armCOMM_s.h; the r11,d15 arguments presumably bound the registers saved and restored - confirm

	138 @ Structure offsets for the FFTSpec

	139 .set ARMsFFTSpec_N, 0

	140 .set ARMsFFTSpec_pBitRev, 4 @// defined for completeness; not read in this function

	141 .set ARMsFFTSpec_pTwiddle, 8

	142 .set ARMsFFTSpec_pBuf, 12

	143

	144 @// Define stack arguments

	145

	146 @// Read the size from structure and take log

	147 LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	148

	149 @// Read other structure parameters

	150 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]

	151 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] @// pOut = pFFTSpec->pBuf, used as the intermediate buffer

	152

	153 @// N=1: treat separately

	154 CMP N,#1

	155 BGT sizeGreaterThanOne

	156 VLD1 dX0[0],[pSrc] @// load the single input sample into lane 0 of d0

	157 MOV zero,#0

	158 VMOV dzero[0],zero

	159 VMOV dZero[0],zero

	160 VST3 {dX0[0],dzero[0],dZero[0]},[pDst] @// write (x, 0, 0) as three consecutive 32-bit words

	161

	162 B End

	163

	164

	165

	166 sizeGreaterThanOne:

	167 @// Do a N/2 point complex FFT including the scaling

	168

	169 MOV N,N,ASR #1 @// N/2 point complex FFT

	170 @// order is held in r14 (LR): every use of order below happens before the first BL on its path

	171 CLZ order,N @// N = 2^order

	172 RSB order,order,#31 @// order = 31 - CLZ(N)

	173 MOV subFFTSize,#1

	174 @//MOV subFFTNum,N  (not needed: subFFTNum and N are the same register, r6)

	175

	176 CMP order,#3

	177 BGT orderGreaterthan3 @// order > 3

	178

	179 CMP order,#1

	180 BGE orderGreaterthan0 @// order > 0

	181 VLD1 dX0,[pSrc] @// order = 0: a single complex point, copied to pOut unchanged

	182 VST1 dX0,[pOut]

	183 MOV pSrc,pOut

	184 MOV argDst,pDst

	185 BLT FFTEnd @// always taken here: flags are still those of CMP order,#1 and BGE fell through

	186

	187 orderGreaterthan0:

	188 @// set the buffers appropriately for various orders

	189 CMP order,#2

	190 MOVEQ argDst,pDst

	191 MOVNE argDst,pOut

	192 @// Pass the first stage destination in RN5 (r5 = pOut)

	193 MOVNE pOut,pDst

	194 MOV argTwiddle,pTwiddle @// argTwiddle shares r1 with pDst, so pDst must already be consumed here

	195

	196 CMP order,#1

	197 BGT orderGreaterthan1

	198 @// order = 1

	199 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe

	200 B FFTEnd

	201

	202 orderGreaterthan1:

	203 CMP order,#2

	204 BGT orderGreaterthan2

	205 @// order =2

	206 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe

	207 BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

	208 B FFTEnd

	209

	210 orderGreaterthan2:@// order =3

	211 BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe

	212 BL armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

	213 BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

	214

	215 B FFTEnd

	216

	217

	218

	219 orderGreaterthan3:

	220 specialScaleCase:

	221

	222 @// Set input args to fft stages

	223 TST order, #2

	224 MOVEQ argDst,pDst

	225 MOVNE argDst,pOut

	226 @// Pass the first stage destination in RN5 (r5 = pOut)

	227 MOVNE pOut,pDst

	228 MOV argTwiddle,pTwiddle @// overwrites r1 (pDst); pDst is not read again on this path

	229

	230 @//check for even or odd order

	231 @// NOTE: The following combination of BL's would work fine even though

	232 @// the first BL would corrupt the flags. This is because the end of

	233 @// the "grpZeroSetLoop" loop inside

	234 @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag

	235 @// to EQ

	236

	237 TST order,#0x00000001

	238 BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe @// bit 0 clear (even order): radix-4 first stage

	239 BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe @// bit 0 set (odd order): radix-8 first stage

	240 @// NOTE(review): subFFTNum (r6) and subFFTSize (r7) are presumably updated by the stage routines - confirm in their sources

	241 CMP subFFTNum,#4

	242 BLT FFTEnd

	243

	244

	245 unscaledRadix4Loop:

	246 BEQ lastStageUnscaledRadix4 @// subFFTNum == 4: finish with the last-stage routine

	247 BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe

	248 CMP subFFTNum,#4

	249 B unscaledRadix4Loop

	250

	251 lastStageUnscaledRadix4:

	252 BL armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe

	253 B FFTEnd

	254

	255

	256 FFTEnd:

	257 finalComplexToRealFixup:

	258 @// From here on pSrc is read as the complex FFT result and argDst is written as the final output

	259

	260 @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]

	261 @// 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]

	262 @// 1/2[2a+j0] - j [0+j2b]

	263 @// (a+b, 0)

	264

	265 @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]

	266 @// 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]

	267 @// 1/2[2a+j0] + j [0+j2b]

	268 @// (a-b, 0)

	269

	270 @// F(0) and F(N/2)

	271 VLD2 {dX0r[0],dX0i[0]},[pSrc]! @// lane 0 of d2/d3 = Z0.r / Z0.i; pSrc advances past Z(0)

	272 MOV zero,#0

	273 VMOV dX0r[1],zero

	274 MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes

	275 VMOV dX0i[1],zero

	276 @// twStep = 3N/8 * 8 bytes pointing to W^1

	277 SUB twStep,step,subFFTSize,LSL #1

	278

	279 VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)

	280 MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes

	281 VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)

	282 SUBS subFFTSize,subFFTSize,#2 @// sets the flags tested by BLT End / BEQ lastElement below; nothing in between alters them

	283

	284 VST1 dY0r,[argDst],step @// store F(0), then advance argDst by step

	285 ADD pTwiddleTmp,argTwiddle,#8 @// W^2

	286 VST1 dY0i,[argDst]! @// store F(N/2)

	287 ADD argTwiddle,argTwiddle,twStep @// W^1

	288

	289 VDUP dzero,zero

	290 SUB argDst,argDst,step @// argDst now points at the slot right after F(0)

	291

	292 BLT End

	293 BEQ lastElement

	294 SUB step,step,#24

	295 SUB step1,step1,#8 @// (N/4-1)*8 bytes

	296

	297 @// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]

	298 @// Note: W^k is stored as negative values in the table

	299 @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)

	300 @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)

	301

	302

	303 LDR t0, =HALF

	304 VLD1 half[0], [t0] @// half[0] = 0.5; half is d0, so dX0 is no longer usable

	305

	306 evenOddButterflyLoop:

	307 @// Each pass reads from both ends of the remaining range and moves the pointers inwards

	308

	309 VLD1 dW0r,[argTwiddle],step1

	310 VLD1 dW1r,[argTwiddle]!

	311

	312 VLD2 {dX0r,dX0i},[pSrc],step @// front pair, de-interleaved into real and imaginary registers

	313 SUB argTwiddle,argTwiddle,step1

	314 VLD2 {dX1r,dX1i},[pSrc]! @// back pair

	315

	316

	317

	318 SUB step1,step1,#8 @// (N/4-2)*8 bytes

	319 VLD1 dW0i,[pTwiddleTmp],step1

	320 VLD1 dW1i,[pTwiddleTmp]!

	321 SUB pSrc,pSrc,step

	322

	323 SUB pTwiddleTmp,pTwiddleTmp,step1

	324 VREV64 dX1r,dX1r @// swap the two lanes of the back pair so they line up with the front pair

	325 VREV64 dX1i,dX1i

	326 SUBS subFFTSize,subFFTSize,#4 @// flags are consumed by BGT at the bottom of the loop; nothing in between alters them

	327

	328

	329

	330 VSUB dT2,dX0r,dX1r @// a-c

	331 SUB step1,step1,#8

	332 VADD dT0,dX0r,dX1r @// a+c

	333 VSUB dT1,dX0i,dX1i @// b-d

	334 VADD dT3,dX0i,dX1i @// b+d

	335 VMUL dT0,dT0,half[0] @// (a+c)/2

	336 VMUL dT1,dT1,half[0] @// (b-d)/2

	337 VZIP dW1r,dW1i

	338 VZIP dW0r,dW0i

	339

	340

	341 VMUL qT0,dW1r,dT2

	342 VMUL qT1,dW1r,dT3

	343 VMUL qT2,dW0r,dT2

	344 VMUL qT3,dW0r,dT3

	345

	346 VMLA qT0,dW1i,dT3

	347 VMLS qT1,dW1i,dT2

	348

	349 VMLS qT2,dW0i,dT3

	350 VMLA qT3,dW0i,dT2

	351

	352

	353 VMUL dX1r,qT0,half[0] @// dX1r/dX1i are reused as temporaries for the halved products

	354 VMUL dX1i,qT1,half[0]

	355

	356 VSUB dY1r,dT0,dX1i @// F(N/2 -1)

	357 VADD dY1i,dT1,dX1r

	358 VNEG dY1i,dY1i

	359

	360 VREV64 dY1r,dY1r @// restore memory order of the back pair before storing

	361 VREV64 dY1i,dY1i

	362

	363

	364 VMUL dX0r,qT2,half[0]

	365 VMUL dX0i,qT3,half[0]

	366

	367 VSUB dY0r,dT0,dX0i @// F(1)

	368 VADD dY0i,dT1,dX0r

	369

	370

	371 VST2 {dY0r,dY0i},[argDst],step @// store the front pair interleaved, then advance to the back pair

	372 VST2 {dY1r,dY1i},[argDst]!

	373 SUB argDst,argDst,step

	374 SUB step,step,#32 @// (N/2-4)*8 bytes

	375

	376

	377 BGT evenOddButterflyLoop

	378

	379 @// set both the ptrs to the last element

	380 SUB pSrc,pSrc,#8

	381 SUB argDst,argDst,#8

	382

	383

	384

	385 @// Last element can be expanded as follows

	386 @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]

	387 @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]

	388 @// 1/2[2a+j0] + j (c+jd) [0+j2b]

	389 @// (a-bc, -bd)

	390 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	391

	392 lastElement:

	393 VLD1 dX0r,[pSrc] @// load (a, b)

	394

	395 VST1 dX0r[0],[argDst]! @// store a

	396 VNEG dX0r,dX0r

	397 VST1 dX0r[1],[argDst]! @// store -b

	398

	399 End:

	400 @// Set return value

	401 MOV result, #OMX_Sts_NoErr

	402

	403 @// Write function tail

	404 M_END

	405

	406 .end

OLD	NEW