third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S - Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED

Side by Side Diff: third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S

Issue 12317152: Add openmax dl routines for review. MUST NOT BE LANDED (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S ('K') | « third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S ('k') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S » ('j') | third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 @//

	2 @// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.

	3 @//

	4 @// Use of this source code is governed by a BSD-style license

	5 @// that can be found in the LICENSE file in the root of the source

	6 @// tree. An additional intellectual property rights grant can be found

	7 @// in the file PATENTS. All contributing project authors may

	8 @// be found in the AUTHORS file in the root of the source tree.

	9 @//

	10 @// This file was originally licensed as follows. It has been

	11 @// relicensed with permission from the copyright holders.

	12

	13 @//

	14 @//

	15 @// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s

	16 @// OpenMAX DL: v1.0.2

	17 @// Last Modified Revision: 7761

	18 @// Last Modified Date: Wed, 26 Sep 2007

	19 @//

	20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.

	21 @//

	22 @//

	23 @//

	24 @// Description:

	25 @// Compute a Radix 4 FFT stage for a N point complex signal

	26 @//

	27 @//

	28

	29

	30 @// Include standard headers

	31

	32 #include "dl/api/armCOMM_s.h"

	33 #include "dl/api/omxtypes_s.h"

	34

	35

	36

	37 @// Import symbols required from other files

	38 @// (For example tables)

	39

	40

	41

	42

	43 @// Set debugging level

	44 @//DEBUG_ON SETL {TRUE}

	45

	46

	47 @// Guarding implementation by the processor name

	48

	49

	50

	51 @// Guarding implementation by the processor name

	52

	53

	54 @// Import symbols required from other files

	55 @// (For example tables)

	56

	57

	58 @// Input registers: FFTSTAGE reads these without initialising them

	59 @// (it loads from pSrc and pTwiddle and stores to pDst)

	60 #define pSrc r0

	61 #define pDst r2

	62 #define pTwiddle r1

	63 #define subFFTNum r6

	64 #define subFFTSize r7

	65 @// FFTSTAGE also rewrites subFFTNum (>>2) and subFFTSize (<<2) in place

	66

	67

	68 @// Output registers

	69 @// (none defined separately; FFTSTAGE ends by rewriting pSrc and pDst)

	70

	71 @// Local scratch registers

	72 @// NOTE: t1 and grpCount are both r3; FFTSTAGE writes t1 only after its group loop has ended

	73 #define grpCount r3

	74 #define pointStep r4

	75 #define outPointStep r5

	76 #define stepTwiddle r12

	77 #define setCount r14

	78 #define srcStep r8

	79 #define setStep r9

	80 #define dstStep r10

	81 #define twStep r11

	82 #define t1 r3

	83

	84 @// Neon registers

	85 @// Each Qn overlays the pair D(2n):D(2n+1), so the Q names below alias D names (summary after qT3)

	86 #define dW1 D0.S16

	87 #define dW2 D1.S16

	88 #define dW3 D2.S16

	89 @// dW* are loaded from pTwiddle, dX* are loaded from pSrc, dY* are intermediates, dZ* are stored to pDst

	90 #define dXr0 D4.S16

	91 #define dXi0 D5.S16

	92 #define dXr1 D6.S16

	93 #define dXi1 D7.S16

	94 #define dXr2 D8.S16

	95 #define dXi2 D9.S16

	96 #define dXr3 D10.S16

	97 #define dXi3 D11.S16

	98 #define dYr0 D12.S16

	99 #define dYi0 D13.S16

	100 #define dYr1 D14.S16

	101 #define dYi1 D15.S16

	102 #define dYr2 D16.S16

	103 #define dYi2 D17.S16

	104 #define dYr3 D18.S16

	105 #define dYi3 D19.S16

	106 #define qT0 Q8.S32

	107 #define qT1 Q9.S32

	108 #define qT2 Q6.S32

	109 #define qT3 Q7.S32

	110 @// Aliases: qX0 = dXr0:dXi0, qYn = dYrn:dYin, qZn = dZrn:dZin; qT2,qT3 overlay qY0,qY1 and qT0,qT1 overlay qY2,qY3

	111 #define dZr0 D20.S16

	112 #define dZi0 D21.S16

	113 #define dZr1 D22.S16

	114 #define dZi1 D23.S16

	115 #define dZr2 D24.S16

	116 #define dZi2 D25.S16

	117 #define dZr3 D26.S16

	118 #define dZi3 D27.S16

	119 #define qY0 Q6.S16

	120 #define qY1 Q7.S16

	121 #define qY2 Q8.S16

	122 #define qY3 Q9.S16

	123 #define qX0 Q2.S16

	124 #define qZ0 Q10.S16

	125 #define qZ1 Q11.S16

	126 #define qZ2 Q12.S16

	127 #define qZ3 Q13.S16

	128

	129

	130 .MACRO FFTSTAGE scaled, inverse , name

	131 @// FFTSTAGE scaled, inverse, name: one radix-4 FFT stage, reading from pSrc and writing to pDst.

	132 @// Define stack arguments (none are defined; the macro reads no stack slots)

	133 @// scaled = TRUE selects halving VHADD/VHSUB instead of VADD/VSUB; inverse = TRUE flips the sign of the dW[1] products and swaps the store order of dZ2 and dZ3.

	134 @// name is appended to the loop labels. On exit subFFTNum is /4, subFFTSize is *4 and pSrc/pDst have been swapped for the next stage.

	135 @// Update grpCount (= 4*subFFTSize), subFFTNum (/= 4) and subFFTSize (*= 4) right away so the input registers can be reused

	136

	137 LSL grpCount,subFFTSize,#2

	138 LSR subFFTNum,subFFTNum,#2

	139 MOV subFFTSize,grpCount

	140

	141

	142 @// pOut0+1 increments pOut0 by 4 bytes

	143 @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes

	144

	145 MOV stepTwiddle,#0

	146 SMULBB outPointStep,grpCount,subFFTNum @// outPointStep = grpCount*subFFTNum (signed multiply of the low halfwords only)

	147

	148 @// pT0+1 increments pT0 by 4 bytes

	149 @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes

	150

	151 LSL pointStep,subFFTNum,#2 @// pointStep = 4*subFFTNum (original note: 2*grpSize)

	152

	153 VLD1 dW1,[pTwiddle :64] @// [wi | wr]; no writeback, so dW1..dW3 all start from the same entry

	154 MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep

	155 VLD1 dW2,[pTwiddle :64] @// [wi | wr]

	156 ADD setStep,srcStep,pointStep @// setStep = 3*pointStep

	157 SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16

	158 VLD1 dW3,[pTwiddle :64]

	159 @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16

	160 RSB setStep,setStep,#0 @// setStep = - 3*pointStep

	161

	162 MOV dstStep,outPointStep,LSL #1

	163 ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep

	164 RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16

	165

	166

	167 @// Outer loop: one pass per group; grpCount is decremented by 4 per pass

	168 grpLoop\name:

	169

	170 VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]

	171 ADD stepTwiddle,stepTwiddle,pointStep

	172 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]

	173 ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point

	174 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]

	175 MOV twStep,stepTwiddle,LSL #2

	176 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc

	177

	178 SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle

	179

	180

	181 MOV setCount,pointStep,LSR #2 @// setCount = pointStep/4

	182 ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set

	183 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set

	184

	185 @// Loop on the sets : 4 at a time

	186 @// The loads for the next set are interleaved with the arithmetic on the current one

	187 setLoop\name:

	188

	189 SUBS setCount,setCount,#4 @// decrement the loop counter; these flags feed the BGT at the bottom of the loop

	190

	191 .ifeqs "\inverse", "TRUE"

	192 VMULL qT0,dXr1,dW1[0]

	193 VMLAL qT0,dXi1,dW1[1] @// real part

	194 VMULL qT1,dXi1,dW1[0]

	195 VMLSL qT1,dXr1,dW1[1] @// imag part

	196

	197 .ELSE

	198 VMULL qT0,dXr1,dW1[0]

	199 VMLSL qT0,dXi1,dW1[1] @// real part

	200 VMULL qT1,dXi1,dW1[0]

	201 VMLAL qT1,dXr1,dW1[1] @// imag part

	202

	203 .ENDIF

	204

	205 VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]

	206

	207 .ifeqs "\inverse", "TRUE"

	208 VMULL qT2,dXr2,dW2[0]

	209 VMLAL qT2,dXi2,dW2[1] @// real part

	210 VMULL qT3,dXi2,dW2[0]

	211 VMLSL qT3,dXr2,dW2[1] @// imag part

	212

	213 .ELSE

	214 VMULL qT2,dXr2,dW2[0]

	215 VMLSL qT2,dXi2,dW2[1] @// real part

	216 VMULL qT3,dXi2,dW2[0]

	217 VMLAL qT3,dXr2,dW2[1] @// imag part

	218

	219 .ENDIF

	220

	221 VRSHRN dZr1,qT0,#15 @// round and narrow the 32-bit products to 16 bits (shift right by 15)

	222 VRSHRN dZi1,qT1,#15

	223

	224

	225 VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]

	226

	227 .ifeqs "\inverse", "TRUE"

	228 VMULL qT0,dXr3,dW3[0]

	229 VMLAL qT0,dXi3,dW3[1] @// real part

	230 VMULL qT1,dXi3,dW3[0]

	231 VMLSL qT1,dXr3,dW3[1] @// imag part

	232

	233 .ELSE

	234 VMULL qT0,dXr3,dW3[0]

	235 VMLSL qT0,dXi3,dW3[1] @// real part

	236 VMULL qT1,dXi3,dW3[0]

	237 VMLAL qT1,dXr3,dW3[1] @// imag part

	238

	239 .ENDIF

	240

	241 VRSHRN dZr2,qT2,#15

	242 VRSHRN dZi2,qT3,#15

	243

	244

	245 VRSHRN dZr3,qT0,#15

	246 VRSHRN dZi3,qT1,#15

	247 VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
	aedla 2013/06/26 20:24:46 16 byte OOB read when both loops are at their last 16 byte OOB read when both loops are at their last iteration.
	248

	249

	250 .ifeqs "\scaled", "TRUE"

	251 @// Scaled variant: every add/sub below is a halving VHADD/VHSUB

	252 @// finish first stage of 4 point FFT

	253 VHADD qY0,qX0,qZ2

	254 VHSUB qY2,qX0,qZ2

	255

	256 VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]

	257 VHADD qY1,qZ1,qZ3

	258 VHSUB qY3,qZ1,qZ3

	259

	260

	261 @// finish second stage of 4 point FFT

	262 @// NOTE(review): qZ0/qZ1 are built from qY2 and qY1 while dZ2/dZ3 use dY0 and dY3 - confirm against the intended butterfly

	263 .ifeqs "\inverse", "TRUE"

	264

	265 VHSUB qZ0,qY2,qY1

	266

	267 VHADD dZr2,dYr0,dYi3

	268 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	269 VHSUB dZi2,dYi0,dYr3

	270

	271 VHADD qZ1,qY2,qY1

	272 VST2 {dZr2,dZi2},[pDst :128],outPointStep

	273

	274 VHSUB dZr3,dYr0,dYi3

	275 VST2 {dZr1,dZi1},[pDst :128],outPointStep

	276 VHADD dZi3,dYi0,dYr3

	277 VST2 {dZr3,dZi3},[pDst :128],dstStep

	278

	279

	280 .ELSE

	281

	282 VHSUB qZ0,qY2,qY1

	283

	284 VHSUB dZr3,dYr0,dYi3

	285 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	286 VHADD dZi3,dYi0,dYr3

	287

	288 VHADD qZ1,qY2,qY1

	289 VST2 {dZr3,dZi3},[pDst :128],outPointStep

	290

	291 VHADD dZr2,dYr0,dYi3

	292 VHSUB dZi2,dYi0,dYr3

	293 VST2 {dZr1,dZi1},[pDst :128],outPointStep

	294 VST2 {dZr2,dZi2},[pDst :128],dstStep

	295

	296

	297 .ENDIF

	298

	299

	300 .ELSE

	301 @// Unscaled variant: same sequence with plain VADD/VSUB (no halving)

	302 @// finish first stage of 4 point FFT

	303 VADD qY0,qX0,qZ2

	304 VSUB qY2,qX0,qZ2

	305

	306 VLD2 {dXr0,dXi0},[pSrc]! @// data[0]; NOTE(review): no :128 alignment hint here, unlike the scaled path - confirm whether intentional

	307 VADD qY1,qZ1,qZ3

	308 VSUB qY3,qZ1,qZ3

	309

	310

	311 @// finish second stage of 4 point FFT

	312

	313

	314 .ifeqs "\inverse", "TRUE"

	315

	316 VSUB qZ0,qY2,qY1

	317

	318 VADD dZr2,dYr0,dYi3

	319 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	320 VSUB dZi2,dYi0,dYr3

	321

	322 VADD qZ1,qY2,qY1

	323 VST2 {dZr2,dZi2},[pDst :128],outPointStep

	324

	325 VSUB dZr3,dYr0,dYi3

	326 VST2 {dZr1,dZi1},[pDst :128],outPointStep

	327 VADD dZi3,dYi0,dYr3

	328 VST2 {dZr3,dZi3},[pDst :128],dstStep

	329

	330

	331 .ELSE

	332

	333 VSUB qZ0,qY2,qY1

	334

	335 VSUB dZr3,dYr0,dYi3

	336 VST2 {dZr0,dZi0},[pDst :128],outPointStep

	337 VADD dZi3,dYi0,dYr3

	338

	339 VADD qZ1,qY2,qY1

	340 VST2 {dZr3,dZi3},[pDst :128],outPointStep

	341

	342 VADD dZr2,dYr0,dYi3

	343 VSUB dZi2,dYi0,dYr3

	344 VST2 {dZr1,dZi1},[pDst :128],outPointStep

	345 VST2 {dZr2,dZi2},[pDst :128],dstStep

	346

	347

	348 .ENDIF

	349

	350

	351

	352 .ENDIF

	353

	354 ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set

	355 BGT setLoop\name @// flags are still those of the SUBS at the top of the loop (nothing in between sets flags)

	356 @// Reload dW1..dW3 for the next group; twStep (-3*stepTwiddle) is the writeback of the last load

	357 VLD1 dW1,[pTwiddle :64],stepTwiddle @// [wi | wr]

	358 SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4

	359 VLD1 dW2,[pTwiddle :64],stepTwiddle @// [wi | wr]

	360 ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp

	361 VLD1 dW3,[pTwiddle :64],twStep @// [wi | wr]
	aedla 2013/06/26 20:24:46 8 byte OOB read at the last iteration, coming from 8 byte OOB read at the last iteration, coming from the ping pong buffer and is not used, so no impact really. IIRC we didn't bother to fix it in FC32.
	362

	363

	364

	365 BGT grpLoop\name

	366

	367

	368 @// Reset and Swap pSrc and pDst for the next stage

	369 MOV t1,pDst

	370 SUB pDst,pSrc,outPointStep,LSL #2 @// new pDst = old pSrc - 4*outPointStep (original note: pDst -= size; pSrc -= 4*size bytes)

	371 SUB pSrc,t1,outPointStep @// new pSrc = old pDst - outPointStep

	372

	373

	374 .endm

	375 @// Entry points: each wraps one FFTSTAGE expansion in M_START/M_END (not defined in this file; presumably from armCOMM_s.h - confirm).

	376 @// Forward, unscaled (scaled=FALSE, inverse=FALSE)

	377 M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4

	378 FFTSTAGE "FALSE","FALSE",FWD

	379 M_END

	380

	381 @// Inverse, unscaled (scaled=FALSE, inverse=TRUE)

	382 M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4

	383 FFTSTAGE "FALSE","TRUE",INV

	384 M_END

	385

	386 @// Forward, scaled (scaled=TRUE, inverse=FALSE)

	387 M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4

	388 FFTSTAGE "TRUE","FALSE",FWDSFS

	389 M_END

	390

	391 @// Inverse, scaled (scaled=TRUE, inverse=TRUE)

	392 M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4

	393 FFTSTAGE "TRUE","TRUE",INVSFS

	394 M_END

	395

	396 @// NOTE(review), all four entry points: FFTSTAGE writes r0-r12, r14 and D0-D2, D4-D27, while M_START is passed only r4 (meaning not visible here).

	397 @// Presumably the callers of these unsafe routines preserve whatever else they need - confirm against armCOMM_s.h and the call sites.

	398

	399

	400 .END

OLD	NEW