| Index: celt/arm/celt_pitch_xcorr_arm.s
 | 
| diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s
 | 
| new file mode 100644
 | 
| index 0000000000000000000000000000000000000000..09917b16bf25fff2b169fa3654597c39caf70c89
 | 
| --- /dev/null
 | 
| +++ b/celt/arm/celt_pitch_xcorr_arm.s
 | 
| @@ -0,0 +1,545 @@
 | 
| +; Copyright (c) 2007-2008 CSIRO
 | 
| +; Copyright (c) 2007-2009 Xiph.Org Foundation
 | 
| +; Copyright (c) 2013      Parrot
 | 
| +; Written by Aurélien Zanelli
 | 
| +;
 | 
| +; Redistribution and use in source and binary forms, with or without
 | 
| +; modification, are permitted provided that the following conditions
 | 
| +; are met:
 | 
| +;
 | 
| +; - Redistributions of source code must retain the above copyright
 | 
| +; notice, this list of conditions and the following disclaimer.
 | 
| +;
 | 
| +; - Redistributions in binary form must reproduce the above copyright
 | 
| +; notice, this list of conditions and the following disclaimer in the
 | 
| +; documentation and/or other materials provided with the distribution.
 | 
| +;
 | 
| +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
| +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
| +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
| +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 | 
| +; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 | 
| +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 | 
| +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 | 
| +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 | 
| +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 | 
| +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 | 
| +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
| +
 | 
| +  AREA  |.text|, CODE, READONLY
 | 
| +
 | 
| +  GET    celt/arm/armopts.s
 | 
| +
 | 
| +IF OPUS_ARM_MAY_HAVE_EDSP
 | 
| +  EXPORT celt_pitch_xcorr_edsp
 | 
| +ENDIF
 | 
| +
 | 
| +IF OPUS_ARM_MAY_HAVE_NEON
 | 
| +  EXPORT celt_pitch_xcorr_neon
 | 
| +ENDIF
 | 
| +
 | 
| +IF OPUS_ARM_MAY_HAVE_NEON
 | 
| +
 | 
| +; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
 | 
| +xcorr_kernel_neon PROC
 | 
| +  ; input:
 | 
| +  ;   r3     = int         len
 | 
| +  ;   r4     = opus_val16 *x
 | 
| +  ;   r5     = opus_val16 *y
 | 
| +  ;   q0     = opus_val32  sum[4]
 | 
| +  ; output:
 | 
| +  ;   q0     = opus_val32  sum[4]
 | 
| +  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
 | 
| +  ; internal usage:
 | 
| +  ;   r12 = int j
 | 
| +  ;   d3  = y_3|y_2|y_1|y_0
 | 
| +  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
 | 
| +  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
 | 
| +  ;   q8  = scratch
 | 
| +  ;
 | 
| +  ; Load y[0...3]
 | 
| +  ; This requires len>0 to always be valid (which we assert in the C code).
 | 
| +  VLD1.16      {d5}, [r5]!
 | 
| +  SUBS         r12, r3, #8
 | 
| +  BLE xcorr_kernel_neon_process4
 | 
| +; Process 8 samples at a time.
 | 
| +; This loop loads one y value more than we actually need. Therefore we have to
 | 
| +; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
 | 
| +; reading past the end of the array.
 | 
| +xcorr_kernel_neon_process8
 | 
| +  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
 | 
| +  ; - 2 cycles of ARM insrtuctions,
 | 
| +  ; - 10 cycles of load/store/byte permute instructions, and
 | 
| +  ; - 9 cycles of data processing instructions.
 | 
| +  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
 | 
| +  ; latter two categories, meaning the whole loop should run in 10 cycles per
 | 
| +  ; iteration, barring cache misses.
 | 
| +  ;
 | 
| +  ; Load x[0...7]
 | 
| +  VLD1.16      {d6, d7}, [r4]!
 | 
| +  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
 | 
| +  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
 | 
| +  VAND         d3, d5, d5
 | 
| +  SUBS         r12, r12, #8
 | 
| +  ; Load y[4...11]
 | 
| +  VLD1.16      {d4, d5}, [r5]!
 | 
| +  VMLAL.S16    q0, d3, d6[0]
 | 
| +  VEXT.16      d16, d3, d4, #1
 | 
| +  VMLAL.S16    q0, d4, d7[0]
 | 
| +  VEXT.16      d17, d4, d5, #1
 | 
| +  VMLAL.S16    q0, d16, d6[1]
 | 
| +  VEXT.16      d16, d3, d4, #2
 | 
| +  VMLAL.S16    q0, d17, d7[1]
 | 
| +  VEXT.16      d17, d4, d5, #2
 | 
| +  VMLAL.S16    q0, d16, d6[2]
 | 
| +  VEXT.16      d16, d3, d4, #3
 | 
| +  VMLAL.S16    q0, d17, d7[2]
 | 
| +  VEXT.16      d17, d4, d5, #3
 | 
| +  VMLAL.S16    q0, d16, d6[3]
 | 
| +  VMLAL.S16    q0, d17, d7[3]
 | 
| +  BGT xcorr_kernel_neon_process8
 | 
| +; Process 4 samples here if we have > 4 left (still reading one extra y value).
 | 
| +xcorr_kernel_neon_process4
 | 
| +  ADDS         r12, r12, #4
 | 
| +  BLE xcorr_kernel_neon_process2
 | 
| +  ; Load x[0...3]
 | 
| +  VLD1.16      d6, [r4]!
 | 
| +  ; Use VAND since it's a data processing instruction again.
 | 
| +  VAND         d4, d5, d5
 | 
| +  SUB          r12, r12, #4
 | 
| +  ; Load y[4...7]
 | 
| +  VLD1.16      d5, [r5]!
 | 
| +  VMLAL.S16    q0, d4, d6[0]
 | 
| +  VEXT.16      d16, d4, d5, #1
 | 
| +  VMLAL.S16    q0, d16, d6[1]
 | 
| +  VEXT.16      d16, d4, d5, #2
 | 
| +  VMLAL.S16    q0, d16, d6[2]
 | 
| +  VEXT.16      d16, d4, d5, #3
 | 
| +  VMLAL.S16    q0, d16, d6[3]
 | 
| +; Process 2 samples here if we have > 2 left (still reading one extra y value).
 | 
| +xcorr_kernel_neon_process2
 | 
| +  ADDS         r12, r12, #2
 | 
| +  BLE xcorr_kernel_neon_process1
 | 
| +  ; Load x[0...1]
 | 
| +  VLD2.16      {d6[],d7[]}, [r4]!
 | 
| +  ; Use VAND since it's a data processing instruction again.
 | 
| +  VAND         d4, d5, d5
 | 
| +  SUB          r12, r12, #2
 | 
| +  ; Load y[4...5]
 | 
| +  VLD1.32      {d5[]}, [r5]!
 | 
| +  VMLAL.S16    q0, d4, d6
 | 
| +  VEXT.16      d16, d4, d5, #1
 | 
| +  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
 | 
| +  ; instead of VEXT, since it's a data-processing instruction.
 | 
| +  VSRI.64      d5, d4, #32
 | 
| +  VMLAL.S16    q0, d16, d7
 | 
| +; Process 1 sample using the extra y value we loaded above.
 | 
| +xcorr_kernel_neon_process1
 | 
| +  ; Load next *x
 | 
| +  VLD1.16      {d6[]}, [r4]!
 | 
| +  ADDS         r12, r12, #1
 | 
| +  ; y[0...3] are left in d5 from prior iteration(s) (if any)
 | 
| +  VMLAL.S16    q0, d5, d6
 | 
| +  MOVLE        pc, lr
 | 
| +; Now process 1 last sample, not reading ahead.
 | 
| +  ; Load last *y
 | 
| +  VLD1.16      {d4[]}, [r5]!
 | 
| +  VSRI.64      d4, d5, #16
 | 
| +  ; Load last *x
 | 
| +  VLD1.16      {d6[]}, [r4]!
 | 
| +  VMLAL.S16    q0, d4, d6
 | 
| +  MOV          pc, lr
 | 
| +  ENDP
 | 
| +
 | 
| +; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
 | 
| +;  opus_val32 *xcorr, int len, int max_pitch)
 | 
| +celt_pitch_xcorr_neon PROC
 | 
| +  ; input:
 | 
| +  ;   r0  = opus_val16 *_x
 | 
| +  ;   r1  = opus_val16 *_y
 | 
| +  ;   r2  = opus_val32 *xcorr
 | 
| +  ;   r3  = int         len
 | 
| +  ; output:
 | 
| +  ;   r0  = int         maxcorr
 | 
| +  ; internal usage:
 | 
| +  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
 | 
| +  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
 | 
| +  ;   r6  = int         max_pitch
 | 
| +  ;   r12 = int         j
 | 
| +  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
 | 
| +  STMFD        sp!, {r4-r6, lr}
 | 
| +  LDR          r6, [sp, #16]
 | 
| +  VMOV.S32     q15, #1
 | 
| +  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
 | 
| +  SUBS         r6, r6, #4
 | 
| +  BLT celt_pitch_xcorr_neon_process4_done
 | 
| +celt_pitch_xcorr_neon_process4
 | 
| +  ; xcorr_kernel_neon parameters:
 | 
| +  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
 | 
| +  MOV          r4, r0
 | 
| +  MOV          r5, r1
 | 
| +  VEOR         q0, q0, q0
 | 
| +  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
 | 
| +  ; So we don't save/restore any other registers.
 | 
| +  BL xcorr_kernel_neon
 | 
| +  SUBS         r6, r6, #4
 | 
| +  VST1.32      {q0}, [r2]!
 | 
| +  ; _y += 4
 | 
| +  ADD          r1, r1, #8
 | 
| +  VMAX.S32     q15, q15, q0
 | 
| +  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
 | 
| +  BGE celt_pitch_xcorr_neon_process4
 | 
| +; We have less than 4 sums left to compute.
 | 
| +celt_pitch_xcorr_neon_process4_done
 | 
| +  ADDS         r6, r6, #4
 | 
| +  ; Reduce maxcorr to a single value
 | 
| +  VMAX.S32     d30, d30, d31
 | 
| +  VPMAX.S32    d30, d30, d30
 | 
| +  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
 | 
| +  BLE celt_pitch_xcorr_neon_done
 | 
| +; Now compute each remaining sum one at a time.
 | 
| +celt_pitch_xcorr_neon_process_remaining
 | 
| +  MOV          r4, r0
 | 
| +  MOV          r5, r1
 | 
| +  VMOV.I32     q0, #0
 | 
| +  SUBS         r12, r3, #8
 | 
| +  BLT celt_pitch_xcorr_neon_process_remaining4
 | 
| +; Sum terms 8 at a time.
 | 
| +celt_pitch_xcorr_neon_process_remaining_loop8
 | 
| +  ; Load x[0...7]
 | 
| +  VLD1.16      {q1}, [r4]!
 | 
| +  ; Load y[0...7]
 | 
| +  VLD1.16      {q2}, [r5]!
 | 
| +  SUBS         r12, r12, #8
 | 
| +  VMLAL.S16    q0, d4, d2
 | 
| +  VMLAL.S16    q0, d5, d3
 | 
| +  BGE celt_pitch_xcorr_neon_process_remaining_loop8
 | 
| +; Sum terms 4 at a time.
 | 
| +celt_pitch_xcorr_neon_process_remaining4
 | 
| +  ADDS         r12, r12, #4
 | 
| +  BLT celt_pitch_xcorr_neon_process_remaining4_done
 | 
| +  ; Load x[0...3]
 | 
| +  VLD1.16      {d2}, [r4]!
 | 
| +  ; Load y[0...3]
 | 
| +  VLD1.16      {d3}, [r5]!
 | 
| +  SUB          r12, r12, #4
 | 
| +  VMLAL.S16    q0, d3, d2
 | 
| +celt_pitch_xcorr_neon_process_remaining4_done
 | 
| +  ; Reduce the sum to a single value.
 | 
| +  VADD.S32     d0, d0, d1
 | 
| +  VPADDL.S32   d0, d0
 | 
| +  ADDS         r12, r12, #4
 | 
| +  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
 | 
| +; Sum terms 1 at a time.
 | 
| +celt_pitch_xcorr_neon_process_remaining_loop1
 | 
| +  VLD1.16      {d2[]}, [r4]!
 | 
| +  VLD1.16      {d3[]}, [r5]!
 | 
| +  SUBS         r12, r12, #1
 | 
| +  VMLAL.S16    q0, d2, d3
 | 
| +  BGT celt_pitch_xcorr_neon_process_remaining_loop1
 | 
| +celt_pitch_xcorr_neon_process_remaining_loop_done
 | 
| +  VST1.32      {d0[0]}, [r2]!
 | 
| +  VMAX.S32     d30, d30, d0
 | 
| +  SUBS         r6, r6, #1
 | 
| +  ; _y++
 | 
| +  ADD          r1, r1, #2
 | 
| +  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
 | 
| +  BGT celt_pitch_xcorr_neon_process_remaining
 | 
| +celt_pitch_xcorr_neon_done
 | 
| +  VMOV.32      r0, d30[0]
 | 
| +  LDMFD        sp!, {r4-r6, pc}
 | 
| +  ENDP
 | 
| +
 | 
| +ENDIF
 | 
| +
 | 
| +IF OPUS_ARM_MAY_HAVE_EDSP
 | 
| +
 | 
| +; This will get used on ARMv7 devices without NEON, so it has been optimized
 | 
| +; to take advantage of dual-issuing where possible.
 | 
| +xcorr_kernel_edsp PROC
 | 
| +  ; input:
 | 
| +  ;   r3      = int         len
 | 
| +  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
 | 
| +  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
 | 
| +  ;   r6...r9 = opus_val32  sum[4]
 | 
| +  ; output:
 | 
| +  ;   r6...r9 = opus_val32  sum[4]
 | 
| +  ; preserved: r0-r5
 | 
| +  ; internal usage
 | 
| +  ;   r2      = int         j
 | 
| +  ;   r12,r14 = opus_val16  x[4]
 | 
| +  ;   r10,r11 = opus_val16  y[4]
 | 
| +  STMFD        sp!, {r2,r4,r5,lr}
 | 
| +  LDR          r10, [r5], #4      ; Load y[0...1]
 | 
| +  SUBS         r2, r3, #4         ; j = len-4
 | 
| +  LDR          r11, [r5], #4      ; Load y[2...3]
 | 
| +  BLE xcorr_kernel_edsp_process4_done
 | 
| +  LDR          r12, [r4], #4      ; Load x[0...1]
 | 
| +  ; Stall
 | 
| +xcorr_kernel_edsp_process4
 | 
| +  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
 | 
| +  ; other. Every other instruction here dual-issues with a multiply, and is
 | 
| +  ; thus "free". There should be no stalls in the body of the loop.
 | 
| +  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
 | 
| +  LDR          r14, [r4], #4      ; Load x[2...3]
 | 
| +  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
 | 
| +  SUBS         r2, r2, #4         ; j-=4
 | 
| +  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
 | 
| +  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
 | 
| +  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
 | 
| +  LDR          r10, [r5], #4      ; Load y[4...5]
 | 
| +  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
 | 
| +  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
 | 
| +  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
 | 
| +  LDRGT        r12, [r4], #4      ; Load x[0...1]
 | 
| +  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
 | 
| +  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
 | 
| +  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
 | 
| +  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
 | 
| +  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
 | 
| +  LDR          r11, [r5], #4      ; Load y[6...7]
 | 
| +  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
 | 
| +  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
 | 
| +  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
 | 
| +  BGT xcorr_kernel_edsp_process4
 | 
| +xcorr_kernel_edsp_process4_done
 | 
| +  ADDS         r2, r2, #4
 | 
| +  BLE xcorr_kernel_edsp_done
 | 
| +  LDRH         r12, [r4], #2      ; r12 = *x++
 | 
| +  SUBS         r2, r2, #1         ; j--
 | 
| +  ; Stall
 | 
| +  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
 | 
| +  LDRGTH       r14, [r4], #2      ; r14 = *x++
 | 
| +  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
 | 
| +  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
 | 
| +  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
 | 
| +  BLE xcorr_kernel_edsp_done
 | 
| +  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
 | 
| +  SUBS         r2, r2, #1         ; j--
 | 
| +  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
 | 
| +  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
 | 
| +  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
 | 
| +  LDRGTH       r12, [r4], #2      ; r12 = *x++
 | 
| +  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
 | 
| +  BLE xcorr_kernel_edsp_done
 | 
| +  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
 | 
| +  CMP          r2, #1             ; j--
 | 
| +  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
 | 
| +  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
 | 
| +  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
 | 
| +  LDRGTH       r14, [r4]          ; r14 = *x
 | 
| +  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
 | 
| +  BLE xcorr_kernel_edsp_done
 | 
| +  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
 | 
| +  LDRH         r11, [r5]          ; r11 = y_6 = *y
 | 
| +  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
 | 
| +  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
 | 
| +  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
 | 
| +xcorr_kernel_edsp_done
 | 
| +  LDMFD        sp!, {r2,r4,r5,pc}
 | 
| +  ENDP
 | 
| +
 | 
| +celt_pitch_xcorr_edsp PROC
 | 
| +  ; input:
 | 
| +  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
 | 
| +  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
 | 
| +  ;   r2  = opus_val32 *xcorr
 | 
| +  ;   r3  = int         len
 | 
| +  ; output:
 | 
| +  ;   r0  = maxcorr
 | 
| +  ; internal usage
 | 
| +  ;   r4  = opus_val16 *x
 | 
| +  ;   r5  = opus_val16 *y
 | 
| +  ;   r6  = opus_val32  sum0
 | 
| +  ;   r7  = opus_val32  sum1
 | 
| +  ;   r8  = opus_val32  sum2
 | 
| +  ;   r9  = opus_val32  sum3
 | 
| +  ;   r1  = int         max_pitch
 | 
| +  ;   r12 = int         j
 | 
| +  STMFD        sp!, {r4-r11, lr}
 | 
| +  MOV          r5, r1
 | 
| +  LDR          r1, [sp, #36]
 | 
| +  MOV          r4, r0
 | 
| +  TST          r5, #3
 | 
| +  ; maxcorr = 1
 | 
| +  MOV          r0, #1
 | 
| +  BEQ          celt_pitch_xcorr_edsp_process1u_done
 | 
| +; Compute one sum at the start to make y 32-bit aligned.
 | 
| +  SUBS         r12, r3, #4
 | 
| +  ; r14 = sum = 0
 | 
| +  MOV          r14, #0
 | 
| +  LDRH         r8, [r5], #2
 | 
| +  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
 | 
| +  LDR          r6, [r4], #4
 | 
| +  MOV          r8, r8, LSL #16
 | 
| +celt_pitch_xcorr_edsp_process1u_loop4
 | 
| +  LDR          r9, [r5], #4
 | 
| +  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 | 
| +  LDR          r7, [r4], #4
 | 
| +  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
 | 
| +  LDR          r8, [r5], #4
 | 
| +  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
 | 
| +  SUBS         r12, r12, #4         ; j-=4
 | 
| +  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
 | 
| +  LDRGT        r6, [r4], #4
 | 
| +  BGT celt_pitch_xcorr_edsp_process1u_loop4
 | 
| +  MOV          r8, r8, LSR #16
 | 
| +celt_pitch_xcorr_edsp_process1u_loop4_done
 | 
| +  ADDS         r12, r12, #4
 | 
| +celt_pitch_xcorr_edsp_process1u_loop1
 | 
| +  LDRGEH       r6, [r4], #2
 | 
| +  ; Stall
 | 
| +  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
 | 
| +  SUBGES       r12, r12, #1
 | 
| +  LDRGTH       r8, [r5], #2
 | 
| +  BGT celt_pitch_xcorr_edsp_process1u_loop1
 | 
| +  ; Restore _x
 | 
| +  SUB          r4, r4, r3, LSL #1
 | 
| +  ; Restore and advance _y
 | 
| +  SUB          r5, r5, r3, LSL #1
 | 
| +  ; maxcorr = max(maxcorr, sum)
 | 
| +  CMP          r0, r14
 | 
| +  ADD          r5, r5, #2
 | 
| +  MOVLT        r0, r14
 | 
| +  SUBS         r1, r1, #1
 | 
| +  ; xcorr[i] = sum
 | 
| +  STR          r14, [r2], #4
 | 
| +  BLE celt_pitch_xcorr_edsp_done
 | 
| +celt_pitch_xcorr_edsp_process1u_done
 | 
| +  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
 | 
| +  SUBS         r1, r1, #4
 | 
| +  BLT celt_pitch_xcorr_edsp_process2
 | 
| +celt_pitch_xcorr_edsp_process4
 | 
| +  ; xcorr_kernel_edsp parameters:
 | 
| +  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
 | 
| +  MOV          r6, #0
 | 
| +  MOV          r7, #0
 | 
| +  MOV          r8, #0
 | 
| +  MOV          r9, #0
 | 
| +  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
 | 
| +  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
 | 
| +  CMP          r0, r6
 | 
| +  ; _y+=4
 | 
| +  ADD          r5, r5, #8
 | 
| +  MOVLT        r0, r6
 | 
| +  CMP          r0, r7
 | 
| +  MOVLT        r0, r7
 | 
| +  CMP          r0, r8
 | 
| +  MOVLT        r0, r8
 | 
| +  CMP          r0, r9
 | 
| +  MOVLT        r0, r9
 | 
| +  STMIA        r2!, {r6-r9}
 | 
| +  SUBS         r1, r1, #4
 | 
| +  BGE celt_pitch_xcorr_edsp_process4
 | 
| +celt_pitch_xcorr_edsp_process2
 | 
| +  ADDS         r1, r1, #2
 | 
| +  BLT celt_pitch_xcorr_edsp_process1a
 | 
| +  SUBS         r12, r3, #4
 | 
| +  ; {r10, r11} = {sum0, sum1} = {0, 0}
 | 
| +  MOV          r10, #0
 | 
| +  MOV          r11, #0
 | 
| +  LDR          r8, [r5], #4
 | 
| +  BLE celt_pitch_xcorr_edsp_process2_loop_done
 | 
| +  LDR          r6, [r4], #4
 | 
| +  LDR          r9, [r5], #4
 | 
| +celt_pitch_xcorr_edsp_process2_loop4
 | 
| +  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 | 
| +  LDR          r7, [r4], #4
 | 
| +  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 | 
| +  SUBS         r12, r12, #4         ; j-=4
 | 
| +  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
 | 
| +  LDR          r8, [r5], #4
 | 
| +  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
 | 
| +  LDRGT        r6, [r4], #4
 | 
| +  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
 | 
| +  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
 | 
| +  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
 | 
| +  LDRGT        r9, [r5], #4
 | 
| +  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
 | 
| +  BGT celt_pitch_xcorr_edsp_process2_loop4
 | 
| +celt_pitch_xcorr_edsp_process2_loop_done
 | 
| +  ADDS         r12, r12, #2
 | 
| +  BLE  celt_pitch_xcorr_edsp_process2_1
 | 
| +  LDR          r6, [r4], #4
 | 
| +  ; Stall
 | 
| +  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 | 
| +  LDR          r9, [r5], #4
 | 
| +  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 | 
| +  SUB          r12, r12, #2
 | 
| +  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
 | 
| +  MOV          r8, r9
 | 
| +  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
 | 
| +celt_pitch_xcorr_edsp_process2_1
 | 
| +  LDRH         r6, [r4], #2
 | 
| +  ADDS         r12, r12, #1
 | 
| +  ; Stall
 | 
| +  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 | 
| +  LDRGTH       r7, [r4], #2
 | 
| +  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 | 
| +  BLE celt_pitch_xcorr_edsp_process2_done
 | 
| +  LDRH         r9, [r5], #2
 | 
| +  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
 | 
| +  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
 | 
| +celt_pitch_xcorr_edsp_process2_done
 | 
| +  ; Restore _x
 | 
| +  SUB          r4, r4, r3, LSL #1
 | 
| +  ; Restore and advance _y
 | 
| +  SUB          r5, r5, r3, LSL #1
 | 
| +  ; maxcorr = max(maxcorr, sum0)
 | 
| +  CMP          r0, r10
 | 
| +  ADD          r5, r5, #2
 | 
| +  MOVLT        r0, r10
 | 
| +  SUB          r1, r1, #2
 | 
| +  ; maxcorr = max(maxcorr, sum1)
 | 
| +  CMP          r0, r11
 | 
| +  ; xcorr[i] = sum
 | 
| +  STR          r10, [r2], #4
 | 
| +  MOVLT        r0, r11
 | 
| +  STR          r11, [r2], #4
 | 
| +celt_pitch_xcorr_edsp_process1a
 | 
| +  ADDS         r1, r1, #1
 | 
| +  BLT celt_pitch_xcorr_edsp_done
 | 
| +  SUBS         r12, r3, #4
 | 
| +  ; r14 = sum = 0
 | 
| +  MOV          r14, #0
 | 
| +  BLT celt_pitch_xcorr_edsp_process1a_loop_done
 | 
| +  LDR          r6, [r4], #4
 | 
| +  LDR          r8, [r5], #4
 | 
| +  LDR          r7, [r4], #4
 | 
| +  LDR          r9, [r5], #4
 | 
| +celt_pitch_xcorr_edsp_process1a_loop4
 | 
| +  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 | 
| +  SUBS         r12, r12, #4         ; j-=4
 | 
| +  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
 | 
| +  LDRGE        r6, [r4], #4
 | 
| +  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
 | 
| +  LDRGE        r8, [r5], #4
 | 
| +  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
 | 
| +  LDRGE        r7, [r4], #4
 | 
| +  LDRGE        r9, [r5], #4
 | 
| +  BGE celt_pitch_xcorr_edsp_process1a_loop4
 | 
| +celt_pitch_xcorr_edsp_process1a_loop_done
 | 
| +  ADDS         r12, r12, #2
 | 
| +  LDRGE        r6, [r4], #4
 | 
| +  LDRGE        r8, [r5], #4
 | 
| +  ; Stall
 | 
| +  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 | 
| +  SUBGE        r12, r12, #2
 | 
| +  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
 | 
| +  ADDS         r12, r12, #1
 | 
| +  LDRGEH       r6, [r4], #2
 | 
| +  LDRGEH       r8, [r5], #2
 | 
| +  ; Stall
 | 
| +  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
 | 
| +  ; maxcorr = max(maxcorr, sum)
 | 
| +  CMP          r0, r14
 | 
| +  ; xcorr[i] = sum
 | 
| +  STR          r14, [r2], #4
 | 
| +  MOVLT        r0, r14
 | 
| +celt_pitch_xcorr_edsp_done
 | 
| +  LDMFD        sp!, {r4-r11, pc}
 | 
| +  ENDP
 | 
| +
 | 
| +ENDIF
 | 
| +
 | 
| +END
 | 
| 
 |