third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S ('k') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S » ('j') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @//

	11 @// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s

	12 @// to support float instead of SC32.

	13 @//

	14

	15 @//

	16 @// Description:

	17 @// Compute a radix-4 FFT stage for an N-point complex signal

	18 @//

	19 @//

	20

	21

	22 @// Include standard headers

	23

	24 #include "dl/api/armCOMM_s.h"

	25 #include "dl/api/omxtypes_s.h"

	26

	27

	28 @// Import symbols required from other files

	29 @// (For example tables)

	30

	31

	32

	33

	34 @// Set debugging level

	35 @//DEBUG_ON SETL {TRUE}

	36

	37

	38

	39 @// Guarding implementation by the processor name

	40

	41

	42

	43

	44 @// Guarding implementation by the processor name

	45

	46

	47 @// Import symbols required from other files

	48 @// (For example tables)

	49

	50

	51 @// Input registers: ARM core registers that FFTSTAGE reads on entry (three pointers plus the sub-FFT count and size)

	52 @// Keep the define lines below free of trailing comments: the preprocessor would paste that text into every use of the alias

	53 #define pSrc r0

	54 #define pDst r2

	55 #define pTwiddle r1

	56 #define subFFTNum r6

	57 #define subFFTSize r7

	58

	59

	60

	61 @// Output registers (no separate aliases are defined; FFTSTAGE updates pSrc, pDst, subFFTNum and subFFTSize in place)

	62

	63

	64 @// Local scratch registers: loop counters and byte strides computed inside FFTSTAGE

	65 @// t1 shares r3 with grpCount; FFTSTAGE only writes t1 after the group loop has finished

	66 #define grpCount r3

	67 #define pointStep r4

	68 #define outPointStep r5

	69 #define stepTwiddle r12

	70 #define setCount r14

	71 #define srcStep r8

	72 #define setStep r9

	73 #define dstStep r10

	74 #define twStep r11

	75 #define t1 r3

	76

	77 @// NEON register aliases, all typed as F32

	78 @// dW1-dW3 each hold one two-lane twiddle; dXrK/dXiK, dYrK/dYiK and dZrK/dZiK are consecutive D-register pairs

	79 #define dW1 D0.F32

	80 #define dW2 D1.F32

	81 #define dW3 D2.F32

	82

	83 #define dXr0 D4.F32

	84 #define dXi0 D5.F32

	85 #define dXr1 D6.F32

	86 #define dXi1 D7.F32

	87 #define dXr2 D8.F32

	88 #define dXi2 D9.F32

	89 #define dXr3 D10.F32

	90 #define dXi3 D11.F32

	91 #define dYr0 D12.F32

	92 #define dYi0 D13.F32

	93 #define dYr1 D14.F32

	94 #define dYi1 D15.F32

	95 #define dYr2 D16.F32

	96 #define dYi2 D17.F32

	97 #define dYr3 D18.F32

	98 #define dYi3 D19.F32

	99 #define qT0 d16.f32

	100 #define qT1 d18.f32

	101 #define qT2 d12.f32

	102 #define qT3 d14.f32

	103 #define dZr0 D20.F32

	104 #define dZi0 D21.F32

	105 #define dZr1 D22.F32

	106 #define dZi1 D23.F32

	107 #define dZr2 D24.F32

	108 #define dZi2 D25.F32

	109 #define dZr3 D26.F32

	110 #define dZi3 D27.F32

	111 @// Quad views of the pairs above: qX0 = dXr0:dXi0, qYk = dYrk:dYik, qZk = dZrk:dZik. NOTE(review): qT0-qT3 expand to D registers despite the q prefix and are not referenced anywhere in this file - confirm they can go

	112 #define qY0 Q6.F32

	113 #define qY1 Q7.F32

	114 #define qY2 Q8.F32

	115 #define qY3 Q9.F32

	116 #define qX0 Q2.F32

	117 #define qZ0 Q10.F32

	118 #define qZ1 Q11.F32

	119 #define qZ2 Q12.F32

	120 #define qZ3 Q13.F32

	121

	122 .MACRO FFTSTAGE scaled, inverse , name

	123 @// FFTSTAGE emits one radix-4 FFT stage: an outer group loop (one twiddle triple dW1,dW2,dW3 per pass) around an inner set loop (16 bytes from each of the four inputs per pass)

	124 @// Define stack arguments (none are declared in this macro)

	125 @// Macro arguments: scaled is not referenced in the body; inverse equal to TRUE selects the inverse-transform arithmetic; name is appended to the local labels

	126 @// Registers read on entry: pSrc, pDst, pTwiddle, subFFTNum, subFFTSize. On exit subFFTNum has been divided by 4, subFFTSize multiplied by 4, and pSrc/pDst swapped (see the end of the macro)

	127 @// Update grpCount and grpSize right away in order to reuse

	128 @// pGrpCount and pGrpSize regs

	129

	130 LSL grpCount,subFFTSize,#2 @// grpCount = 4*subFFTSize

	131 LSR subFFTNum,subFFTNum,#2 @// subFFTNum = subFFTNum/4

	132 MOV subFFTSize,grpCount @// subFFTSize = 4 * (its value on entry)

	133

	134 VLD1 dW1,[pTwiddle] @//[wi | wr]

	135 @// pT0+1 increments pT0 by 8 bytes

	136 @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes

	137 MOV pointStep,subFFTNum,LSL #1 @// pointStep = 2*subFFTNum (scaled to bytes further down)

	138

	139

	140 @// pOut0+1 increments pOut0 by 8 bytes

	141 @// pOut0+outPointStep == increment of 8*outPointStep bytes

	142 @// = 2*size bytes

	143

	144 MOV stepTwiddle,#0 @// twiddle byte stride starts at zero; it grows by pointStep at the top of every group

	145 VLD1 dW2,[pTwiddle] @//[wi | wr]

	146 SMULBB outPointStep,grpCount,pointStep @// signed 16x16 multiply of the low halfwords only: both operands must fit in 16 bits for outPointStep = grpCount*pointStep

	147 LSL pointStep,pointStep,#2 @// pointStep = 8*subFFTNum (2*grpSize)

	148

	149 VLD1 dW3,[pTwiddle] @//[wi | wr] (no writeback: dW1, dW2 and dW3 all start as the same first table entry)

	150 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep

	151 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep

	152

	153 RSB setStep,setStep,#0 @// setStep = - 3*pointStep

	154 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16

	155

	156 MOV dstStep,outPointStep,LSL #1 @// dstStep = 2*outPointStep

	157 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep

	158 @// dstStep = - 3*outPointStep+16

	159 RSB dstStep,dstStep,#16

	160

	161 @// Group loop: preloads data[0..3] of the first set, derives the twiddle strides, then runs the set loop

	162

	163 radix4GrpLoop\name :

	164

	165 VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]: 16 bytes de-interleaved (even words to dXr0, odd words to dXi0), then pSrc += pointStep

	166 ADD stepTwiddle,stepTwiddle,pointStep @// stepTwiddle grows by pointStep for each group

	167 VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]

	168 @// set pTwiddle to the first point

	169 ADD pTwiddle,pTwiddle,stepTwiddle

	170 VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]

	171 MOV twStep,stepTwiddle,LSL #2 @// twStep = 4*stepTwiddle (temporary)

	172

	173 @// data[3] & update pSrc for the next set

	174 VLD2 {dXr3,dXi3},[pSrc],setStep

	175 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle

	176

	177 MOV setCount,pointStep,LSR #3 @// setCount = pointStep/8; the set loop subtracts 2 per pass

	178 @// set pSrc to data[0] of the next set

	179 ADD pSrc,pSrc,#16

	180 @// increment to data[1] of the next set

	181 ADD pSrc,pSrc,pointStep

	182

	183

	184 @// Loop on the sets

	185

	186 radix4SetLoop\name :

	187 @// Twiddle multiply Zk = Xk * Wk for k = 1..3, where W[0] and W[1] are the two lanes of dWk

	188 @// inverse: Zr = Xr*W[0] + Xi*W[1], Zi = Xi*W[0] - Xr*W[1]

	189 @// forward: Zr = Xr*W[0] - Xi*W[1], Zi = Xi*W[0] + Xr*W[1]

	190 .ifeqs "\inverse", "TRUE"

	191 VMUL dZr1,dXr1,dW1[0]

	192 VMUL dZi1,dXi1,dW1[0]

	193 VMUL dZr2,dXr2,dW2[0]

	194 VMUL dZi2,dXi2,dW2[0]

	195 VMUL dZr3,dXr3,dW3[0]

	196 VMUL dZi3,dXi3,dW3[0]

	197

	198 VMLA dZr1,dXi1,dW1[1] @// real part

	199 VMLS dZi1,dXr1,dW1[1] @// imag part

	200

	201 @// data[1] for next iteration (dXr1/dXi1 are free once Z1 is complete)

	202 VLD2 {dXr1,dXi1},[pSrc],pointStep

	203

	204 VMLA dZr2,dXi2,dW2[1] @// real part

	205 VMLS dZi2,dXr2,dW2[1] @// imag part

	206

	207 @// data[2] for next iteration

	208 VLD2 {dXr2,dXi2},[pSrc],pointStep

	209

	210 VMLA dZr3,dXi3,dW3[1] @// real part

	211 VMLS dZi3,dXr3,dW3[1] @// imag part

	212 .else

	213 VMUL dZr1,dXr1,dW1[0]

	214 VMUL dZi1,dXi1,dW1[0]

	215 VMUL dZr2,dXr2,dW2[0]

	216 VMUL dZi2,dXi2,dW2[0]

	217 VMUL dZr3,dXr3,dW3[0]

	218 VMUL dZi3,dXi3,dW3[0]

	219

	220 VMLS dZr1,dXi1,dW1[1] @// real part

	221 VMLA dZi1,dXr1,dW1[1] @// imag part

	222

	223 @// data[1] for next iteration (dXr1/dXi1 are free once Z1 is complete)

	224 VLD2 {dXr1,dXi1},[pSrc],pointStep

	225

	226 VMLS dZr2,dXi2,dW2[1] @// real part

	227 VMLA dZi2,dXr2,dW2[1] @// imag part

	228

	229 @// data[2] for next iteration

	230 VLD2 {dXr2,dXi2},[pSrc],pointStep

	231

	232 VMLS dZr3,dXi3,dW3[1] @// real part

	233 VMLA dZi3,dXr3,dW3[1] @// imag part

	234 .endif

	235

	236 @// data[3] & update pSrc to data[0]

	237 @// But don't read on the very last iteration because that reads past

	238 @// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.

	239 cmp grpCount, #4

	240 cmpeq setCount, #2 @// Test setCount if grpCount = 4

	241 @// These are executed only if both grpCount = 4 and setCount = 2

	242 addeq pSrc, pSrc, setStep @// apply the pointer update that the skipped load would have performed

	243 beq radix4SkipRead\name

	244 VLD2 {dXr3,dXi3},[pSrc],setStep

	245 radix4SkipRead\name:

	246 SUBS setCount,setCount,#2 @// sets the flags tested by the BGT that closes the set loop; nothing in between writes them

	247

	248 @// finish first stage of 4 point FFT

	249 VADD qY0,qX0,qZ2 @// Y0 = X0 + Z2

	250 VSUB qY2,qX0,qZ2 @// Y2 = X0 - Z2

	251

	252 @// data[0] for next iteration (qX0 has been consumed by the two operations above)

	253 VLD2 {dXr0,dXi0},[pSrc :128]!

	254 VADD qY1,qZ1,qZ3 @// Y1 = Z1 + Z3

	255 VSUB qY3,qZ1,qZ3 @// Y3 = Z1 - Z3

	256

	257 @// finish second stage of 4 point FFT

	258

	259 VSUB qZ0,qY2,qY1 @// Z0 = Y2 - Y1

	260 @// NOTE(review): with the aliases in this file Z0 = (X0 - Z2) - (Z1 + Z3) and, below, Z2 = (X0 - Z2) + (Z1 + Z3), while the rotated outputs use Y0 = X0 + Z2

	261 @// NOTE(review): that matches the usual radix-4 butterfly only if the products carry the opposite sign - confirm against the twiddle table convention

	262 .ifeqs "\inverse", "TRUE"

	263 @// inverse: outputs are stored in the order Z0, Z3, Z2, Z1, each store advancing pDst by outPointStep

	264 VADD dZr3,dYr0,dYi3 @// Z3.re = Y0.re + Y3.im

	265 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	266 VSUB dZi3,dYi0,dYr3 @// Z3.im = Y0.im - Y3.re

	267

	268 VADD qZ2,qY2,qY1 @// Z2 = Y2 + Y1

	269 VST2 {dZr3,dZi3},[pDst :128],outPointStep

	270

	271 VSUB dZr1,dYr0,dYi3 @// Z1.re = Y0.re - Y3.im

	272 VST2 {dZr2,dZi2},[pDst :128],outPointStep

	273 VADD dZi1,dYi0,dYr3 @// Z1.im = Y0.im + Y3.re

	274

	275 VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = 16 - 3*outPointStep: back to the first output position, 16 bytes further on

	276

	277

	278 .else

	279 @// forward: outputs are stored in the order Z0, Z1, Z2, Z3, each store advancing pDst by outPointStep

	280 VSUB dZr1,dYr0,dYi3 @// Z1.re = Y0.re - Y3.im

	281 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	282 VADD dZi1,dYi0,dYr3 @// Z1.im = Y0.im + Y3.re

	283

	284 VADD qZ2,qY2,qY1 @// Z2 = Y2 + Y1

	285 VST2 {dZr1,dZi1},[pDst :128],outPointStep

	286

	287 VADD dZr3,dYr0,dYi3 @// Z3.re = Y0.re + Y3.im

	288 VST2 {dZr2,dZi2},[pDst :128],outPointStep

	289 VSUB dZi3,dYi0,dYr3 @// Z3.im = Y0.im - Y3.re

	290

	291 VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = 16 - 3*outPointStep: back to the first output position, 16 bytes further on

	292

	293

	294 .endif

	295

	296 @// increment to data[1] of the next set

	297 ADD pSrc,pSrc,pointStep

	298 BGT radix4SetLoop\name

	299

	300 @// End of group: fetch the next twiddle triple, walking pTwiddle by stepTwiddle, stepTwiddle, then twStep (-3*stepTwiddle)

	301 VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]

	302 @// subtract 4 since grpCount multiplied by 4

	303 SUBS grpCount,grpCount,#4

	304 VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]

	305 @// increment pSrc for the next grp

	306 ADD pSrc,pSrc,srcStep

	307 VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]

	308 BGT radix4GrpLoop\name

	309

	310

	311 @// Reset and Swap pSrc and pDst for the next stage

	312 MOV t1,pDst @// t1 aliases r3 (grpCount), which is no longer needed here

	313 @// new pDst = old pSrc - 4*outPointStep; new pSrc = old pDst - outPointStep

	314 SUB pDst,pSrc,outPointStep,LSL #2

	315 SUB pSrc,t1,outPointStep

	316

	317

	318 .endm

	319

	320 @// Forward radix-4 stage: FFTSTAGE expanded with inverse = FALSE and label suffix FWD, between M_START and M_END (macros not defined in this file, presumably from armCOMM_s.h). NOTE(review): only r4 is passed to M_START although the body also writes r3, r5-r12, r14 and D0-D27; presumably the caller of this unsafe routine preserves them - confirm

	321 M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4

	322 FFTSTAGE "FALSE","FALSE",FWD

	323 M_END

	324

	325 @// Inverse radix-4 stage: FFTSTAGE expanded with inverse = TRUE and label suffix INV, between M_START and M_END (macros not defined in this file, presumably from armCOMM_s.h). NOTE(review): only r4 is passed to M_START although the body also writes r3, r5-r12, r14 and D0-D27; presumably the caller of this unsafe routine preserves them - confirm

	326 M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4

	327 FFTSTAGE "FALSE","TRUE",INV

	328 M_END

	329

	330

	331 .end

OLD	NEW