third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/openmax_dl/dl/sp/api/omxSP.h ('k') | third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S » ('j') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @// This is a modification of

	11 @// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float

	12 @// instead of SC32.

	13 @//

	14

	15 @//

	16 @// Description:

	17 @// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.

	18 @// It computes Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0, 1, 2, ..., N/2-1.

	19 @//

	20 @//

	21

	22

	23 @// Include standard headers

	24

	25 #include "dl/api/armCOMM_s.h"

	26 #include "dl/api/omxtypes_s.h"

	27

	28

	29 @// Import symbols required from other files

	30 @// (For example tables)

	31

	32

	33 @// Set debugging level

	34 @//DEBUG_ON SETL {TRUE}

	35

	36

	37

	38 @// Guarding implementation by the processor name

	39

	40

	41

	42 @// Guarding implementation by the processor name

	43

	44

	45

	46 @// Input registers. Of these, only pSrc (r0) and pFFTSpec (r2) are read by the code shown in this file; pDst and scale are defined but not referenced.

	47

	48 #define pSrc r0

	49 #define pDst r1

	50 #define pFFTSpec r2

	51 #define scale r3

	52

	53

	54 @// Output registers. result is r0, the same physical register as pSrc; the name is not referenced in the code shown.

	55 #define result r0

	56

	57 @// Local scratch registers. Several names share one physical register (argScale/tmpOrder/pTwiddle/x0r = r4, pOut/x0i = r5, subFFTNum/N = r6, subFFTSize/size = r7, count/step = r8, diff/step1 = r9, order/zero = r14), and argDst/diffMinusOne/pOut1 reuse r2 (pFFTSpec), so aliased names must not be live at the same time.

	58

	59 #define argTwiddle r1

	60 #define argDst r2

	61 #define argScale r4

	62 #define tmpOrder r4

	63 #define pTwiddle r4

	64 #define pOut r5

	65 #define subFFTSize r7

	66 #define subFFTNum r6

	67 #define N r6

	68 #define order r14

	69 #define diff r9

	70 @// count: total number of radix stages required to complete the FFT (shares r8 with step, defined below)

	71 #define count r8

	72 #define x0r r4

	73 #define x0i r5

	74 #define diffMinusOne r2

	75 #define round r3

	76

	77 #define pOut1 r2

	78 #define size r7

	79 #define step r8

	80 #define step1 r9

	81 #define twStep r10

	82 #define pTwiddleTmp r11

	83 #define argTwiddle1 r12

	84 #define zero r14

	85

	86 @// Neon registers. Every name below, including qT0-qT3, maps to a D register with the .F32 type suffix. Names overlap: dX0/dX0r = D0, dShift/dX1/dX0i = D1, dY0/dX1r = D2, dY1/dX1i = D3, dW0r/dY0r/dY2 = D4, dW0i/dY0i/dY3 = D5, dW1r/dY1r/dW0 = D6, dW1i/dY1i/dW1 = D7, dT2/dW0Tmp = D10, dT3/dW1Neg = D11; a value held under one name is destroyed by a write through its alias.

	87

	88 #define dX0 D0.F32

	89 #define dShift D1.F32

	90 #define dX1 D1.F32

	91 #define dY0 D2.F32

	92 #define dY1 D3.F32

	93 #define dX0r D0.F32

	94 #define dX0i D1.F32

	95 #define dX1r D2.F32

	96 #define dX1i D3.F32

	97 #define dW0r D4.F32

	98 #define dW0i D5.F32

	99 #define dW1r D6.F32

	100 #define dW1i D7.F32

	101 #define dT0 D8.F32

	102 #define dT1 D9.F32

	103 #define dT2 D10.F32

	104 #define dT3 D11.F32

	105 #define qT0 D12.F32

	106 #define qT1 D14.F32

	107 #define qT2 D16.F32

	108 #define qT3 D18.F32

	109 #define dY0r D4.F32

	110 #define dY0i D5.F32

	111 #define dY1r D6.F32

	112 #define dY1i D7.F32

	113

	114 #define dY2 D4.F32

	115 #define dY3 D5.F32

	116 #define dW0 D6.F32

	117 #define dW1 D7.F32

	118 #define dW0Tmp D10.F32

	119 #define dW1Neg D11.F32

	120

	121 #define half D13.F32

	122

	123 @ Structure offsets for the FFTSpec

	124 .set ARMsFFTSpec_N, 0

	125 .set ARMsFFTSpec_pBitRev, 4

	126 .set ARMsFFTSpec_pTwiddle, 8

	127 .set ARMsFFTSpec_pBuf, 12

	128

	129 .MACRO FFTSTAGE scaled, inverse, name

	130

	131 @// FFTSTAGE body: only the name argument is used (as a label suffix); scaled and inverse are not referenced. Read the FFT size N from the structure (no log is taken here)

	132 LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	133

	134 @// Read the twiddle table pointer and the pBuf pointer (used as pOut) from the structure

	135 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]

	136 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

	137

	138 VMOV half, 0.5

	139

	140

	141 MOV size,N,ASR #1 @// size = N/2; N itself is preserved

	142 MOV step,N,LSL #2 @// step = 4*N = N/2 * 8 bytes

	143

	144

	145 @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}

	146 @// Note: W^(k) is stored as negated value and also need to

	147 @// conjugate the values from the table

	148

	149 @// Z(0) : no need of twiddle multiply

	150 @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

	151

	152 VLD1 dX0,[pSrc],step

	153 ADD pOut1,pOut,step @// pOut1 = pOut + N/2*8 bytes; pOut1 is r2, so pFFTSpec is overwritten here

	154

	155 VLD1 dX1,[pSrc]!

	156 @// twStep = step - 2*size = 3N bytes = 3N/8 * 8 bytes, the offset of W^1

	157 SUB twStep,step,size,LSL #1

	158

	159 MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes

	160 SUB step1,step1,#8 @// (N/4-1)*8 bytes

	161

	162 VADD dY0,dX0,dX1 @// [b+d | a+c]

	163 VSUB dY1,dX0,dX1 @// [b-d | a-c]

	164 VMUL dY0, dY0, half[0]

	165 VMUL dY1, dY1, half[0]

	166

	167 @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]

	168 VZIP dY0,dY1

	169

	170 VSUB dX0,dY0,dY1

	171 SUBS size,size,#2

	172 VADD dX1,dY0,dY1

	173

	174 SUB pSrc,pSrc,step

	175

	176 VST1 dX0[0],[pOut1]!

	177 ADD pTwiddleTmp,pTwiddle,#8 @// W^2

	178 VST1 dX1[1],[pOut1]!

	179 ADD argTwiddle1,pTwiddle,twStep @// W^1; flags set by SUBS size,size,#2 are still intact for the two branches below

	180

	181

	182 BLT decrementScale\name

	183 BEQ lastElement\name

	184

	185

	186 @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}

	187 @// Note: W^k is stored as negative values in the table and also

	188 @// need to conjugate the values from the table.

	189 @//

	190 @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)

	191 @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)

	192

	193

	194 SUB step,step,#24

	195 evenOddButterflyLoop\name :

	196

	197

	198 VLD1 dW0r,[argTwiddle1],step1

	199 VLD1 dW1r,[argTwiddle1]!

	200

	201 VLD2 {dX0r,dX0i},[pSrc],step

	202 SUB argTwiddle1,argTwiddle1,step1

	203 VLD2 {dX1r,dX1i},[pSrc]!

	204

	205 SUB step1,step1,#8 @// (N/4-2)*8 bytes on the first pass

	206 VLD1 dW0i,[pTwiddleTmp],step1

	207 VLD1 dW1i,[pTwiddleTmp]!

	208 SUB pSrc,pSrc,step

	209

	210 SUB pTwiddleTmp,pTwiddleTmp,step1

	211 VREV64 dX1r,dX1r

	212 VREV64 dX1i,dX1i

	213 SUBS size,size,#4

	214

	215

	216 VSUB dT2,dX0r,dX1r @// a-c

	217 VADD dT3,dX0i,dX1i @// b+d

	218 VADD dT0,dX0r,dX1r @// a+c

	219 VSUB dT1,dX0i,dX1i @// b-d

	220 SUB step1,step1,#8

	221

	222 VMUL dT2, dT2, half[0]

	223 VMUL dT3, dT3, half[0]

	224

	225 VMUL dT0, dT0, half[0]

	226 VMUL dT1, dT1, half[0]

	227

	228 VZIP dW1r,dW1i

	229 VZIP dW0r,dW0i

	230

	231

	232 VMUL dX1r,dW1r,dT2

	233 VMUL dX1i,dW1r,dT3

	234 VMUL dX0r,dW0r,dT2

	235 VMUL dX0i,dW0r,dT3

	236

	237 VMLS dX1r,dW1i,dT3

	238 VMLA dX1i,dW1i,dT2

	239

	240 VMLA dX0r,dW0i,dT3

	241 VMLS dX0i,dW0i,dT2

	242

	243

	244 VADD dY1r,dT0,dX1i @// F(N/2 -1)

	245 VSUB dY1i,dX1r,dT1

	246

	247 VREV64 dY1r,dY1r

	248 VREV64 dY1i,dY1i

	249

	250

	251 VADD dY0r,dT0,dX0i @// F(1)

	252 VSUB dY0i,dT1,dX0r

	253

	254

	255 VST2 {dY0r,dY0i},[pOut1],step

	256 VST2 {dY1r,dY1i},[pOut1]!

	257 SUB pOut1,pOut1,step

	258 SUB step,step,#32 @// shrink the front-to-back stride by 32 bytes for the next pass

	259

	260

	261 BGT evenOddButterflyLoop\name

	262

	263

	264 @// set both the ptrs to the last element

	265 SUB pSrc,pSrc,#8

	266 SUB pOut1,pOut1,#8

	267

	268 @// Last element can be expanded as follows

	269 @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as

	270 @// -ve)

	271 @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]

	272 @// 1/2[2a+j0] - j (c-jd) [0+j2b]

	273 @// (a+bc, -bd)

	274 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	275

	276 lastElement\name :

	277 VLD1 dX0r,[pSrc]

	278

	279 VST1 dX0r[0],[pOut1]!

	280 VNEG dX0r,dX0r

	281 VST1 dX0r[1],[pOut1]

	282

	283

	284

	285 decrementScale\name :

	286

	287 .endm

	288

	289 M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4

	290

	291 FFTSTAGE "FALSE","TRUE",Inv

	292 M_END

	293

	294 .end

OLD	NEW