third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S ('k') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S » ('j') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @// This file was originally licensed as follows. It has been

	11 @// relicensed with permission from the copyright holders.

	12 @//

	13

	14 @//

	15 @// File Name: armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s

	16 @// OpenMAX DL: v1.0.2

	17 @// Last Modified Revision: 7485

	18 @// Last Modified Date: Fri, 21 Sep 2007

	19 @//

	20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.

	21 @//

	22 @//

	23 @//

	24 @// Description:

	25 @// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT

	26 @// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation

	27 @// It implements both "scaled" (by 1/2) and "unscaled" versions of the above formula

	28 @//

	29

	30

	31 @// Include standard headers

	32

	33 #include "dl/api/armCOMM_s.h"

	34 #include "dl/api/omxtypes_s.h"

	35

	36

	37 @// Import symbols required from other files

	38 @// (For example tables)

	39

	40

	41 @// Set debugging level

	42 @//DEBUG_ON SETL {TRUE}

	43

	44

	45

	46 @// Guarding implementation by the processor name

	47

	48

	49

	50 @// Guarding implementation by the processor name

	51

	52

	53

	54 @//Input Registers

	55 @// r0-r3 are given the four input names below; pDst is defined but not referenced by the FFTSTAGE macro in this file

	56 #define pSrc r0

	57 #define pDst r1

	58 #define pFFTSpec r2

	59 #define scale r3

	60

	61

	62 @// Output registers

	63 #define result r0

	64

	65 @//Local Scratch Registers

	66 @// Several names share one physical register (argScale, tmpOrder, pTwiddle and x0r are all r4; argDst, diffMinusOne and pOut1 are r2, which is also pFFTSpec), so only one name per register can be live at a time

	67 #define argTwiddle r1

	68 #define argDst r2

	69 #define argScale r4

	70 #define tmpOrder r4

	71 #define pTwiddle r4

	72 #define pOut r5

	73 #define subFFTSize r7

	74 #define subFFTNum r6

	75 #define N r6

	76 #define order r14

	77 #define diff r9

	78 #define count r8 @// Total number of radix stages required to complete the FFT

	79 #define x0r r4

	80 #define x0i r5

	81 #define diffMinusOne r2

	82 #define round r3

	83 @// Pointer, stride and counter names used by the FFTSTAGE macro body (zero is defined but not referenced in this file)

	84 #define pOut1 r2

	85 #define size r7

	86 #define step r8

	87 #define step1 r9

	88 #define twStep r10

	89 #define pTwiddleTmp r11

	90 #define argTwiddle1 r12

	91 #define zero r14

	92

	93 @// Neon registers

	94 @// D registers are typed as S32 and Q registers as S64; several names alias the same register (dShift, dX1 and dX0i are all D1; dW0r and dY0r are both D4; dW1r and dY1r are both D6)

	95 #define dX0 D0.S32

	96 #define dShift D1.S32

	97 #define dX1 D1.S32

	98 #define dY0 D2.S32

	99 #define dY1 D3.S32

	100 #define dX0r D0.S32

	101 #define dX0i D1.S32

	102 #define dX1r D2.S32

	103 #define dX1i D3.S32

	104 #define dW0r D4.S32

	105 #define dW0i D5.S32

	106 #define dW1r D6.S32

	107 #define dW1i D7.S32

	108 #define dT0 D8.S32

	109 #define dT1 D9.S32

	110 #define dT2 D10.S32

	111 #define dT3 D11.S32

	112 #define qT0 Q6.S64

	113 #define qT1 Q7.S64

	114 #define qT2 Q8.S64

	115 #define qT3 Q9.S64

	116 #define dY0r D4.S32

	117 #define dY0i D5.S32

	118 #define dY1r D6.S32

	119 #define dY1i D7.S32

	120 @// Further aliases onto D4-D7, D10 and D11; none of the six names below is referenced by the FFTSTAGE macro in this file

	121 #define dY2 D4.S32

	122 #define dY3 D5.S32

	123 #define dW0 D6.S32

	124 #define dW1 D7.S32

	125 #define dW0Tmp D10.S32

	126 #define dW1Neg D11.S32

	127

	128

	129 @ Byte offsets of the FFTSpec fields, used with LDR from pFFTSpec (pBitRev is defined but not loaded in this file)

	130 .set ARMsFFTSpec_N, 0

	131 .set ARMsFFTSpec_pBitRev, 4

	132 .set ARMsFFTSpec_pTwiddle, 8

	133 .set ARMsFFTSpec_pBuf, 12

	134

	135

	136 .MACRO FFTSTAGE scaled, inverse, name

	137 @// Expands to one pre-twiddle pass: scaled selects halving (TRUE) or plain add/sub, name is appended to the local labels; inverse is not referenced in the body

	138 @// Read the size N from the FFTSpec structure

	139 LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	140

	141 @// Read other structure parameters

	142 LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]

	143 LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

	144

	145

	146

	147 MOV size,N,ASR #1 @// size = N/2; N itself is left unchanged

	148 MOV step,N,LSL #2 @// step = N/2 * 8 bytes

	149

	150

	151 @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}

	152 @// Note: W^(k) is stored as a negated value, and the values from the table also need to be conjugated

	153

	154 @// Z(0) : no need of twiddle multiply

	155 @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

	156

	157 VLD1 dX0,[pSrc],step @// load the first input pair, then pSrc += step

	158 ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes (pOut1 is r2, so pFFTSpec is overwritten here)

	159

	160 VLD1 dX1,[pSrc]! @// load the pair at the advanced address, with writeback

	161 SUB twStep,step,size,LSL #1 @// twStep = 3N/8 * 8 bytes pointing to W^1

	162

	163 MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes

	164 SUB step1,step1,#8 @// (N/4-1)*8 bytes

	165

	166 VHADD dY0,dX0,dX1 @// [b+d | a+c]

	167 VHSUB dY1,dX0,dX1 @// [b-d | a-c]

	168 VZIP dY0,dY1 @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]

	169

	170 .ifeqs "\scaled", "TRUE"

	171 VHSUB dX0,dY0,dY1

	172 SUBS size,size,#2 @// sets the flags tested by the BLT/BEQ below

	173 VHADD dX1,dY0,dY1

	174 .else

	175 VSUB dX0,dY0,dY1

	176 SUBS size,size,#2 @// sets the flags tested by the BLT/BEQ below

	177 VADD dX1,dY0,dY1

	178 .endif

	179

	180 SUB pSrc,pSrc,step @// undo the post-increment by step; the writeback of the second load remains

	181

	182 VST1 dX0[0],[pOut1]! @// store lane 0 of dX0

	183 ADD pTwiddleTmp,pTwiddle,#8 @// W^2

	184 VST1 dX1[1],[pOut1]! @// store lane 1 of dX1

	185 ADD argTwiddle1,pTwiddle,twStep @// W^1

	186

	187

	188 BLT decrementScale\name @// size went negative: skip both the loop and the last element

	189 BEQ lastElement\name @// size reached zero: skip the loop, handle only the last element

	190

	191

	192 @// Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]

	193 @// Note: W^k is stored as negative values in the table and also need to conjugate the values from the table

	194 @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) since both of them

	195 @// require F(1),F(2) and F(N/2-2),F(N/2-1)

	196

	197

	198 SUB step,step,#24 @// stride between the two paired loads and stores in the loop

	199 evenOddButterflyLoop\name :

	200

	201

	202 VLD1 dW0r,[argTwiddle1],step1 @// load 8 bytes of twiddle data, then argTwiddle1 += step1

	203 VLD1 dW1r,[argTwiddle1]! @// load 8 more bytes, with writeback

	204

	205 VLD2 {dX0r,dX0i},[pSrc],step @// de-interleaving load of two inputs, then pSrc += step

	206 SUB argTwiddle1,argTwiddle1,step1 @// net effect of the pair: argTwiddle1 += 8

	207 VLD2 {dX1r,dX1i},[pSrc]! @// de-interleaving load of the two mirrored inputs, with writeback

	208

	209 SUB step1,step1,#8 @// (N/4-2)*8 bytes

	210 VLD1 dW0i,[pTwiddleTmp],step1 @// load 8 bytes of twiddle data, then pTwiddleTmp += step1

	211 VLD1 dW1i,[pTwiddleTmp]! @// load 8 more bytes, with writeback

	212 SUB pSrc,pSrc,step @// net effect of the pair: pSrc += 16

	213

	214 SUB pTwiddleTmp,pTwiddleTmp,step1 @// net effect of the pair: pTwiddleTmp += 8

	215 VREV64 dX1r,dX1r @// swap the two 32-bit lanes of the mirrored inputs

	216 VREV64 dX1i,dX1i

	217 SUBS size,size,#4 @// flags are consumed by the BGT at the end of the loop

	218

	219

	220 VHSUB dT2,dX0r,dX1r @// a-c

	221 VHADD dT3,dX0i,dX1i @// b+d

	222 SUB step1,step1,#8

	223 VHADD dT0,dX0r,dX1r @// a+c

	224 VHSUB dT1,dX0i,dX1i @// b-d

	225

	226 VZIP dW1r,dW1i @// interleave the lanes of the two loaded twiddle registers

	227 VZIP dW0r,dW0i

	228

	229 @// Widening 32x32 to 64-bit products of the twiddles with (a-c) and (b+d), accumulated in the Q registers

	230 VMULL qT0,dW1r,dT2

	231 VMLSL qT0,dW1i,dT3

	232 VMULL qT1,dW1r,dT3

	233 VMLAL qT1,dW1i,dT2

	234

	235 VMULL qT2,dW0r,dT2

	236 VMLAL qT2,dW0i,dT3

	237 VMULL qT3,dW0r,dT3

	238 VMLSL qT3,dW0i,dT2

	239

	240 @// Narrow back to 32 bits with a rounding right shift by 31; dX1r/dX1i are reused as destinations

	241 VRSHRN dX1r,qT0,#31

	242 VRSHRN dX1i,qT1,#31

	243

	244 .ifeqs "\scaled", "TRUE"

	245 VHADD dY1r,dT0,dX1i @// F(N/2 -1)

	246 VHSUB dY1i,dX1r,dT1

	247 .else

	248 VADD dY1r,dT0,dX1i @// F(N/2 -1)

	249 VSUB dY1i,dX1r,dT1

	250

	251 .endif

	252

	253 @// Swap the two 32-bit lanes of each result before the store (mirrors the VREV64 applied to the loaded dX1 pair)

	254 VREV64 dY1r,dY1r

	255 VREV64 dY1i,dY1i

	256

	257

	258 VRSHRN dX0r,qT2,#31

	259 VRSHRN dX0i,qT3,#31

	260

	261 .ifeqs "\scaled", "TRUE"

	262 VHADD dY0r,dT0,dX0i @// F(1)

	263 VHSUB dY0i,dT1,dX0r

	264 .else

	265 VADD dY0r,dT0,dX0i @// F(1)

	266 VSUB dY0i,dT1,dX0r

	267 .endif

	268

	269

	270 VST2 {dY0r,dY0i},[pOut1],step @// interleaving store, then pOut1 += step

	271 VST2 {dY1r,dY1i},[pOut1]! @// interleaving store of the mirrored results, with writeback

	272 SUB pOut1,pOut1,step @// net effect of the pair: pOut1 += 16

	273 SUB step,step,#32 @// (N/2-4)*8 bytes

	274

	275

	276 BGT evenOddButterflyLoop\name @// loop while size is still positive after the SUBS above

	277

	278

	279 SUB pSrc,pSrc,#8 @// set both the ptrs to the last element

	280 SUB pOut1,pOut1,#8

	281

	282 @// Last element can be expanded as follows

	283 @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as -ve)

	284 @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]

	285 @// 1/2[2a+j0] - j (c-jd) [0+j2b]

	286 @// (a+bc, -bd)

	287 @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	288

	289 lastElement\name :

	290 VLD1 dX0r,[pSrc] @// load the (a, b) pair of the last element

	291

	292 .ifeqs "\scaled", "TRUE"

	293 VSHR dX0r,dX0r,#1 @// scaled variant: halve both lanes

	294 .endif

	295

	296 VST1 dX0r[0],[pOut1]! @// store lane 0 unchanged

	297 VNEG dX0r,dX0r

	298 VST1 dX0r[1],[pOut1] @// store the negated lane 1

	299

	300

	301

	302 decrementScale\name :

	303 @// The scaled variant decrements scale by one; the unscaled variant emits no instruction here

	304 .ifeqs "\scaled", "TRUE"

	305 SUB scale,scale,#1

	306 .endif

	307

	308 .endm

	309

	310 M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4

	311 @// Unscaled variant: expands FFTSTAGE with scaled = FALSE and label suffix Inv. M_START/M_END are not defined in this file (presumably from armCOMM_s.h). NOTE(review): confirm what the r4 argument makes M_START save, since the macro body writes r4-r12, D8-D11 and Q6-Q9

	312 FFTSTAGE "FALSE","TRUE",Inv

	313 M_END

	314

	315 M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4

	316 @// Scaled variant: expands FFTSTAGE with scaled = TRUE and label suffix InvSfs, so halving add/sub is used and scale (r3) is decremented by one. M_START/M_END are not defined in this file (presumably from armCOMM_s.h). NOTE(review): confirm what the r4 argument makes M_START save, since the macro body writes r4-r12, D8-D11 and Q6-Q9

	317 FFTSTAGE "TRUE","TRUE",InvSfs

	318 M_END

	319

	320

	321 .end

OLD	NEW