OLD | NEW |
(Empty) | |
| 1 ; Copyright (c) 2007-2008 CSIRO |
| 2 ; Copyright (c) 2007-2009 Xiph.Org Foundation |
| 3 ; Copyright (c) 2013 Parrot |
| 4 ; Written by Aurélien Zanelli |
| 5 ; |
| 6 ; Redistribution and use in source and binary forms, with or without |
| 7 ; modification, are permitted provided that the following conditions |
| 8 ; are met: |
| 9 ; |
| 10 ; - Redistributions of source code must retain the above copyright |
| 11 ; notice, this list of conditions and the following disclaimer. |
| 12 ; |
| 13 ; - Redistributions in binary form must reproduce the above copyright |
| 14 ; notice, this list of conditions and the following disclaimer in the |
| 15 ; documentation and/or other materials provided with the distribution. |
| 16 ; |
| 17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
| 21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 28 |
| 29 AREA |.text|, CODE, READONLY |
| 30 |
| 31 GET celt/arm/armopts.s |
| 32 |
| 33 IF OPUS_ARM_MAY_HAVE_EDSP |
| 34 EXPORT celt_pitch_xcorr_edsp |
| 35 ENDIF |
| 36 |
| 37 IF OPUS_ARM_MAY_HAVE_NEON |
| 38 EXPORT celt_pitch_xcorr_neon |
| 39 ENDIF |
| 40 |
| 41 IF OPUS_ARM_MAY_HAVE_NEON |
| 42 |
| 43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 |
| 44 xcorr_kernel_neon PROC |
| 45 ; input: |
| 46 ; r3 = int len |
| 47 ; r4 = opus_val16 *x |
| 48 ; r5 = opus_val16 *y |
| 49 ; q0 = opus_val32 sum[4] |
| 50 ; output: |
| 51 ; q0 = opus_val32 sum[4] |
| 52 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 |
| 53 ; internal usage: |
| 54 ; r12 = int j |
| 55 ; d3 = y_3|y_2|y_1|y_0 |
| 56 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 |
| 57 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 |
| 58 ; q8 = scratch |
| 59 ; |
| 60 ; Load y[0...3] |
| 61 ; This requires len>0 to always be valid (which we assert in the C code). |
| 62 VLD1.16 {d5}, [r5]! |
| 63 SUBS r12, r3, #8 |
| 64 BLE xcorr_kernel_neon_process4 |
| 65 ; Process 8 samples at a time. |
| 66 ; This loop loads one y value more than we actually need. Therefore we have to |
| 67 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid |
| 68 ; reading past the end of the array. |
| 69 xcorr_kernel_neon_process8 |
| 70 ; This loop has 19 total instructions (10 cycles to issue, minimum), with |
| 71 ; - 2 cycles of ARM instructions, |
| 72 ; - 10 cycles of load/store/byte permute instructions, and |
| 73 ; - 9 cycles of data processing instructions. |
| 74 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the |
| 75 ; latter two categories, meaning the whole loop should run in 10 cycles per |
| 76 ; iteration, barring cache misses. |
| 77 ; |
| 78 ; Load x[0...7] |
| 79 VLD1.16 {d6, d7}, [r4]! |
| 80 ; Unlike VMOV, VAND is a data processing instruction (and doesn't get |
| 81 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. |
| 82 VAND d3, d5, d5 |
| 83 SUBS r12, r12, #8 |
| 84 ; Load y[4...11] |
| 85 VLD1.16 {d4, d5}, [r5]! |
| 86 VMLAL.S16 q0, d3, d6[0] |
| 87 VEXT.16 d16, d3, d4, #1 |
| 88 VMLAL.S16 q0, d4, d7[0] |
| 89 VEXT.16 d17, d4, d5, #1 |
| 90 VMLAL.S16 q0, d16, d6[1] |
| 91 VEXT.16 d16, d3, d4, #2 |
| 92 VMLAL.S16 q0, d17, d7[1] |
| 93 VEXT.16 d17, d4, d5, #2 |
| 94 VMLAL.S16 q0, d16, d6[2] |
| 95 VEXT.16 d16, d3, d4, #3 |
| 96 VMLAL.S16 q0, d17, d7[2] |
| 97 VEXT.16 d17, d4, d5, #3 |
| 98 VMLAL.S16 q0, d16, d6[3] |
| 99 VMLAL.S16 q0, d17, d7[3] |
| 100 BGT xcorr_kernel_neon_process8 |
| 101 ; Process 4 samples here if we have > 4 left (still reading one extra y value). |
| 102 xcorr_kernel_neon_process4 |
| 103 ADDS r12, r12, #4 |
| 104 BLE xcorr_kernel_neon_process2 |
| 105 ; Load x[0...3] |
| 106 VLD1.16 d6, [r4]! |
| 107 ; Use VAND since it's a data processing instruction again. |
| 108 VAND d4, d5, d5 |
| 109 SUB r12, r12, #4 |
| 110 ; Load y[4...7] |
| 111 VLD1.16 d5, [r5]! |
| 112 VMLAL.S16 q0, d4, d6[0] |
| 113 VEXT.16 d16, d4, d5, #1 |
| 114 VMLAL.S16 q0, d16, d6[1] |
| 115 VEXT.16 d16, d4, d5, #2 |
| 116 VMLAL.S16 q0, d16, d6[2] |
| 117 VEXT.16 d16, d4, d5, #3 |
| 118 VMLAL.S16 q0, d16, d6[3] |
| 119 ; Process 2 samples here if we have > 2 left (still reading one extra y value). |
| 120 xcorr_kernel_neon_process2 |
| 121 ADDS r12, r12, #2 |
| 122 BLE xcorr_kernel_neon_process1 |
| 123 ; Load x[0...1] |
| 124 VLD2.16 {d6[],d7[]}, [r4]! |
| 125 ; Use VAND since it's a data processing instruction again. |
| 126 VAND d4, d5, d5 |
| 127 SUB r12, r12, #2 |
| 128 ; Load y[4...5] |
| 129 VLD1.32 {d5[]}, [r5]! |
| 130 VMLAL.S16 q0, d4, d6 |
| 131 VEXT.16 d16, d4, d5, #1 |
| 132 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI |
| 133 ; instead of VEXT, since it's a data-processing instruction. |
| 134 VSRI.64 d5, d4, #32 |
| 135 VMLAL.S16 q0, d16, d7 |
| 136 ; Process 1 sample using the extra y value we loaded above. |
| 137 xcorr_kernel_neon_process1 |
| 138 ; Load next *x |
| 139 VLD1.16 {d6[]}, [r4]! |
| 140 ADDS r12, r12, #1 |
| 141 ; y[0...3] are left in d5 from prior iteration(s) (if any) |
| 142 VMLAL.S16 q0, d5, d6 |
| 143 MOVLE pc, lr |
| 144 ; Now process 1 last sample, not reading ahead. |
| 145 ; Load last *y |
| 146 VLD1.16 {d4[]}, [r5]! |
| 147 VSRI.64 d4, d5, #16 |
| 148 ; Load last *x |
| 149 VLD1.16 {d6[]}, [r4]! |
| 150 VMLAL.S16 q0, d4, d6 |
| 151 MOV pc, lr |
| 152 ENDP |
| 153 |
| 154 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, |
| 155 ; opus_val32 *xcorr, int len, int max_pitch) |
| 156 celt_pitch_xcorr_neon PROC |
| 157 ; input: |
| 158 ; r0 = opus_val16 *_x |
| 159 ; r1 = opus_val16 *_y |
| 160 ; r2 = opus_val32 *xcorr |
| 161 ; r3 = int len |
| 162 ; output: |
| 163 ; r0 = int maxcorr |
| 164 ; internal usage: |
| 165 ; r4 = opus_val16 *x (for xcorr_kernel_neon()) |
| 166 ; r5 = opus_val16 *y (for xcorr_kernel_neon()) |
| 167 ; r6 = int max_pitch |
| 168 ; r12 = int j |
| 169 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) |
| 170 STMFD sp!, {r4-r6, lr} |
| 171 LDR r6, [sp, #16] ; r6 = max_pitch (5th argument; 4 registers = 16 bytes pushed above) |
| 172 VMOV.S32 q15, #1 ; maxcorr[0...3] = 1 |
| 173 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done |
| 174 SUBS r6, r6, #4 |
| 175 BLT celt_pitch_xcorr_neon_process4_done |
| 176 celt_pitch_xcorr_neon_process4 |
| 177 ; xcorr_kernel_neon parameters: |
| 178 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} |
| 179 MOV r4, r0 |
| 180 MOV r5, r1 |
| 181 VEOR q0, q0, q0 |
| 182 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. |
| 183 ; So we don't save/restore any other registers. |
| 184 BL xcorr_kernel_neon |
| 185 SUBS r6, r6, #4 |
| 186 VST1.32 {q0}, [r2]! |
| 187 ; _y += 4 |
| 188 ADD r1, r1, #8 |
| 189 VMAX.S32 q15, q15, q0 |
| 190 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done |
| 191 BGE celt_pitch_xcorr_neon_process4 |
| 192 ; We have less than 4 sums left to compute. |
| 193 celt_pitch_xcorr_neon_process4_done |
| 194 ADDS r6, r6, #4 |
| 195 ; Reduce maxcorr to a single value |
| 196 VMAX.S32 d30, d30, d31 |
| 197 VPMAX.S32 d30, d30, d30 |
| 198 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done |
| 199 BLE celt_pitch_xcorr_neon_done |
| 200 ; Now compute each remaining sum one at a time. |
| 201 celt_pitch_xcorr_neon_process_remaining |
| 202 MOV r4, r0 |
| 203 MOV r5, r1 |
| 204 VMOV.I32 q0, #0 |
| 205 SUBS r12, r3, #8 |
| 206 BLT celt_pitch_xcorr_neon_process_remaining4 |
| 207 ; Sum terms 8 at a time. |
| 208 celt_pitch_xcorr_neon_process_remaining_loop8 |
| 209 ; Load x[0...7] |
| 210 VLD1.16 {q1}, [r4]! |
| 211 ; Load y[0...7] |
| 212 VLD1.16 {q2}, [r5]! |
| 213 SUBS r12, r12, #8 |
| 214 VMLAL.S16 q0, d4, d2 |
| 215 VMLAL.S16 q0, d5, d3 |
| 216 BGE celt_pitch_xcorr_neon_process_remaining_loop8 |
| 217 ; Sum terms 4 at a time. |
| 218 celt_pitch_xcorr_neon_process_remaining4 |
| 219 ADDS r12, r12, #4 |
| 220 BLT celt_pitch_xcorr_neon_process_remaining4_done |
| 221 ; Load x[0...3] |
| 222 VLD1.16 {d2}, [r4]! |
| 223 ; Load y[0...3] |
| 224 VLD1.16 {d3}, [r5]! |
| 225 SUB r12, r12, #4 |
| 226 VMLAL.S16 q0, d3, d2 |
| 227 celt_pitch_xcorr_neon_process_remaining4_done |
| 228 ; Reduce the sum to a single value. |
| 229 VADD.S32 d0, d0, d1 |
| 230 VPADDL.S32 d0, d0 |
| 231 ADDS r12, r12, #4 |
| 232 BLE celt_pitch_xcorr_neon_process_remaining_loop_done |
| 233 ; Sum terms 1 at a time. |
| 234 celt_pitch_xcorr_neon_process_remaining_loop1 |
| 235 VLD1.16 {d2[]}, [r4]! |
| 236 VLD1.16 {d3[]}, [r5]! |
| 237 SUBS r12, r12, #1 |
| 238 VMLAL.S16 q0, d2, d3 |
| 239 BGT celt_pitch_xcorr_neon_process_remaining_loop1 |
| 240 celt_pitch_xcorr_neon_process_remaining_loop_done |
| 241 VST1.32 {d0[0]}, [r2]! |
| 242 VMAX.S32 d30, d30, d0 |
| 243 SUBS r6, r6, #1 |
| 244 ; _y++ |
| 245 ADD r1, r1, #2 |
| 246 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining |
| 247 BGT celt_pitch_xcorr_neon_process_remaining |
| 248 celt_pitch_xcorr_neon_done |
| 249 VMOV.32 r0, d30[0] |
| 250 LDMFD sp!, {r4-r6, pc} |
| 251 ENDP |
| 252 |
| 253 ENDIF |
| 254 |
| 255 IF OPUS_ARM_MAY_HAVE_EDSP |
| 256 |
| 257 ; This will get used on ARMv7 devices without NEON, so it has been optimized |
| 258 ; to take advantage of dual-issuing where possible. |
| 259 xcorr_kernel_edsp PROC |
| 260 ; input: |
| 261 ; r3 = int len |
| 262 ; r4 = opus_val16 *_x (must be 32-bit aligned) |
| 263 ; r5 = opus_val16 *_y (must be 32-bit aligned) |
| 264 ; r6...r9 = opus_val32 sum[4] |
| 265 ; output: |
| 266 ; r6...r9 = opus_val32 sum[4] |
| 267 ; preserved: r0-r5 |
| 268 ; internal usage |
| 269 ; r2 = int j |
| 270 ; r12,r14 = opus_val16 x[4] |
| 271 ; r10,r11 = opus_val16 y[4] |
| 272 STMFD sp!, {r2,r4,r5,lr} ; r2/r4/r5 are in the preserved list but clobbered below |
| 273 LDR r10, [r5], #4 ; Load y[0...1] |
| 274 SUBS r2, r3, #4 ; j = len-4 |
| 275 LDR r11, [r5], #4 ; Load y[2...3] |
| 276 BLE xcorr_kernel_edsp_process4_done |
| 277 LDR r12, [r4], #4 ; Load x[0...1] |
| 278 ; Stall |
| 279 xcorr_kernel_edsp_process4 |
| 280 ; The multiplies must issue from pipeline 0, and can't dual-issue with each |
| 281 ; other. Every other instruction here dual-issues with a multiply, and is |
| 282 ; thus "free". There should be no stalls in the body of the loop. |
| 283 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) |
| 284 LDR r14, [r4], #4 ; Load x[2...3] |
| 285 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) |
| 286 SUBS r2, r2, #4 ; j-=4 |
| 287 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) |
| 288 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) |
| 289 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) |
| 290 LDR r10, [r5], #4 ; Load y[4...5] |
| 291 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) |
| 292 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) |
| 293 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) |
| 294 LDRGT r12, [r4], #4 ; Load x[0...1] |
| 295 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) |
| 296 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) |
| 297 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) |
| 298 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) |
| 299 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) |
| 300 LDR r11, [r5], #4 ; Load y[6...7] |
| 301 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) |
| 302 SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) |
| 303 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) |
| 304 BGT xcorr_kernel_edsp_process4 |
| 305 xcorr_kernel_edsp_process4_done |
| 306 ADDS r2, r2, #4 |
| 307 BLE xcorr_kernel_edsp_done |
| 308 LDRH r12, [r4], #2 ; r12 = *x++ |
| 309 SUBS r2, r2, #1 ; j-- |
| 310 ; Stall |
| 311 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) |
| 312 LDRGTH r14, [r4], #2 ; r14 = *x++ |
| 313 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) |
| 314 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) |
| 315 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) |
| 316 BLE xcorr_kernel_edsp_done |
| 317 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) |
| 318 SUBS r2, r2, #1 ; j-- |
| 319 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) |
| 320 LDRH r10, [r5], #2 ; r10 = y_4 = *y++ |
| 321 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) |
| 322 LDRGTH r12, [r4], #2 ; r12 = *x++ |
| 323 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) |
| 324 BLE xcorr_kernel_edsp_done |
| 325 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) |
| 326 CMP r2, #1 ; only the flags are needed (j > 1?); r2 is reused for y_5 below |
| 327 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) |
| 328 LDRH r2, [r5], #2 ; r2 = y_5 = *y++ |
| 329 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) |
| 330 LDRGTH r14, [r4] ; r14 = *x |
| 331 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) |
| 332 BLE xcorr_kernel_edsp_done |
| 333 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) |
| 334 LDRH r11, [r5] ; r11 = y_6 = *y |
| 335 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) |
| 336 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) |
| 337 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) |
| 338 xcorr_kernel_edsp_done |
| 339 LDMFD sp!, {r2,r4,r5,pc} |
| 340 ENDP |
| 341 |
| 342 celt_pitch_xcorr_edsp PROC |
| 343 ; input: |
| 344 ; r0 = opus_val16 *_x (must be 32-bit aligned) |
| 345 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) |
| 346 ; r2 = opus_val32 *xcorr |
| 347 ; r3 = int len |
| 348 ; output: |
| 349 ; r0 = maxcorr |
| 350 ; internal usage |
| 351 ; r4 = opus_val16 *x |
| 352 ; r5 = opus_val16 *y |
| 353 ; r6 = opus_val32 sum0 |
| 354 ; r7 = opus_val32 sum1 |
| 355 ; r8 = opus_val32 sum2 |
| 356 ; r9 = opus_val32 sum3 |
| 357 ; r1 = int max_pitch |
| 358 ; r12 = int j |
| 359 STMFD sp!, {r4-r11, lr} |
| 360 MOV r5, r1 |
| 361 LDR r1, [sp, #36] ; r1 = max_pitch (5th argument; 9 registers = 36 bytes pushed above) |
| 362 MOV r4, r0 |
| 363 TST r5, #3 |
| 364 ; maxcorr = 1 |
| 365 MOV r0, #1 |
| 366 BEQ celt_pitch_xcorr_edsp_process1u_done |
| 367 ; Compute one sum at the start to make y 32-bit aligned. |
| 368 SUBS r12, r3, #4 |
| 369 ; r14 = sum = 0 |
| 370 MOV r14, #0 |
| 371 LDRH r8, [r5], #2 |
| 372 BLE celt_pitch_xcorr_edsp_process1u_loop4_done |
| 373 LDR r6, [r4], #4 |
| 374 MOV r8, r8, LSL #16 ; move y_0 to the top half of r8 for the SMLABT below |
| 375 celt_pitch_xcorr_edsp_process1u_loop4 |
| 376 LDR r9, [r5], #4 |
| 377 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 378 LDR r7, [r4], #4 |
| 379 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 380 LDR r8, [r5], #4 |
| 381 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) |
| 382 SUBS r12, r12, #4 ; j-=4 |
| 383 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) |
| 384 LDRGT r6, [r4], #4 |
| 385 BGT celt_pitch_xcorr_edsp_process1u_loop4 |
| 386 MOV r8, r8, LSR #16 ; move the next y back to the bottom half for the SMLABB tail |
| 387 celt_pitch_xcorr_edsp_process1u_loop4_done |
| 388 ADDS r12, r12, #4 |
| 389 celt_pitch_xcorr_edsp_process1u_loop1 |
| 390 LDRGEH r6, [r4], #2 |
| 391 ; Stall |
| 392 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) |
| 393 SUBGES r12, r12, #1 |
| 394 LDRGTH r8, [r5], #2 |
| 395 BGT celt_pitch_xcorr_edsp_process1u_loop1 |
| 396 ; Restore _x |
| 397 SUB r4, r4, r3, LSL #1 |
| 398 ; Restore and advance _y |
| 399 SUB r5, r5, r3, LSL #1 |
| 400 ; maxcorr = max(maxcorr, sum) |
| 401 CMP r0, r14 |
| 402 ADD r5, r5, #2 |
| 403 MOVLT r0, r14 |
| 404 SUBS r1, r1, #1 |
| 405 ; xcorr[i] = sum |
| 406 STR r14, [r2], #4 |
| 407 BLE celt_pitch_xcorr_edsp_done |
| 408 celt_pitch_xcorr_edsp_process1u_done |
| 409 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 |
| 410 SUBS r1, r1, #4 |
| 411 BLT celt_pitch_xcorr_edsp_process2 |
| 412 celt_pitch_xcorr_edsp_process4 |
| 413 ; xcorr_kernel_edsp parameters: |
| 414 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} |
| 415 MOV r6, #0 |
| 416 MOV r7, #0 |
| 417 MOV r8, #0 |
| 418 MOV r9, #0 |
| 419 BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) |
| 420 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) |
| 421 CMP r0, r6 |
| 422 ; _y+=4 |
| 423 ADD r5, r5, #8 |
| 424 MOVLT r0, r6 |
| 425 CMP r0, r7 |
| 426 MOVLT r0, r7 |
| 427 CMP r0, r8 |
| 428 MOVLT r0, r8 |
| 429 CMP r0, r9 |
| 430 MOVLT r0, r9 |
| 431 STMIA r2!, {r6-r9} |
| 432 SUBS r1, r1, #4 |
| 433 BGE celt_pitch_xcorr_edsp_process4 |
| 434 celt_pitch_xcorr_edsp_process2 |
| 435 ADDS r1, r1, #2 |
| 436 BLT celt_pitch_xcorr_edsp_process1a |
| 437 SUBS r12, r3, #4 |
| 438 ; {r10, r11} = {sum0, sum1} = {0, 0} |
| 439 MOV r10, #0 |
| 440 MOV r11, #0 |
| 441 LDR r8, [r5], #4 |
| 442 BLE celt_pitch_xcorr_edsp_process2_loop_done |
| 443 LDR r6, [r4], #4 |
| 444 LDR r9, [r5], #4 |
| 445 celt_pitch_xcorr_edsp_process2_loop4 |
| 446 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 447 LDR r7, [r4], #4 |
| 448 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 449 SUBS r12, r12, #4 ; j-=4 |
| 450 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) |
| 451 LDR r8, [r5], #4 |
| 452 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) |
| 453 LDRGT r6, [r4], #4 |
| 454 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) |
| 455 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) |
| 456 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) |
| 457 LDRGT r9, [r5], #4 |
| 458 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) |
| 459 BGT celt_pitch_xcorr_edsp_process2_loop4 |
| 460 celt_pitch_xcorr_edsp_process2_loop_done |
| 461 ADDS r12, r12, #2 |
| 462 BLE celt_pitch_xcorr_edsp_process2_1 |
| 463 LDR r6, [r4], #4 |
| 464 ; Stall |
| 465 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 466 LDR r9, [r5], #4 |
| 467 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 468 SUB r12, r12, #2 |
| 469 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) |
| 470 MOV r8, r9 |
| 471 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) |
| 472 celt_pitch_xcorr_edsp_process2_1 |
| 473 LDRH r6, [r4], #2 |
| 474 ADDS r12, r12, #1 |
| 475 ; Stall |
| 476 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 477 LDRGTH r7, [r4], #2 |
| 478 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 479 BLE celt_pitch_xcorr_edsp_process2_done |
| 480 LDRH r9, [r5], #2 |
| 481 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) |
| 482 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) |
| 483 celt_pitch_xcorr_edsp_process2_done |
| 484 ; Restore _x |
| 485 SUB r4, r4, r3, LSL #1 |
| 486 ; Restore and advance _y |
| 487 SUB r5, r5, r3, LSL #1 |
| 488 ; maxcorr = max(maxcorr, sum0) |
| 489 CMP r0, r10 |
| 490 ADD r5, r5, #2 |
| 491 MOVLT r0, r10 |
| 492 SUB r1, r1, #2 |
| 493 ; maxcorr = max(maxcorr, sum1) |
| 494 CMP r0, r11 |
| 495 ; xcorr[i] = sum |
| 496 STR r10, [r2], #4 |
| 497 MOVLT r0, r11 |
| 498 STR r11, [r2], #4 |
| 499 celt_pitch_xcorr_edsp_process1a |
| 500 ADDS r1, r1, #1 |
| 501 BLT celt_pitch_xcorr_edsp_done |
| 502 SUBS r12, r3, #4 |
| 503 ; r14 = sum = 0 |
| 504 MOV r14, #0 |
| 505 BLT celt_pitch_xcorr_edsp_process1a_loop_done |
| 506 LDR r6, [r4], #4 |
| 507 LDR r8, [r5], #4 |
| 508 LDR r7, [r4], #4 |
| 509 LDR r9, [r5], #4 |
| 510 celt_pitch_xcorr_edsp_process1a_loop4 |
| 511 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 512 SUBS r12, r12, #4 ; j-=4 |
| 513 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 514 LDRGE r6, [r4], #4 |
| 515 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) |
| 516 LDRGE r8, [r5], #4 |
| 517 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) |
| 518 LDRGE r7, [r4], #4 |
| 519 LDRGE r9, [r5], #4 |
| 520 BGE celt_pitch_xcorr_edsp_process1a_loop4 |
| 521 celt_pitch_xcorr_edsp_process1a_loop_done |
| 522 ADDS r12, r12, #2 |
| 523 LDRGE r6, [r4], #4 |
| 524 LDRGE r8, [r5], #4 |
| 525 ; Stall |
| 526 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 527 SUBGE r12, r12, #2 |
| 528 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 529 ADDS r12, r12, #1 |
| 530 LDRGEH r6, [r4], #2 |
| 531 LDRGEH r8, [r5], #2 |
| 532 ; Stall |
| 533 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) |
| 534 ; maxcorr = max(maxcorr, sum) |
| 535 CMP r0, r14 |
| 536 ; xcorr[i] = sum |
| 537 STR r14, [r2], #4 |
| 538 MOVLT r0, r14 |
| 539 celt_pitch_xcorr_edsp_done |
| 540 LDMFD sp!, {r4-r11, pc} |
| 541 ENDP |
| 542 |
| 543 ENDIF |
| 544 |
| 545 END |
OLD | NEW |