OLD | NEW |
(Empty) | |
| 1 ; Copyright (c) 2007-2008 CSIRO |
| 2 ; Copyright (c) 2007-2009 Xiph.Org Foundation |
| 3 ; Copyright (c) 2013 Parrot |
| 4 ; Written by Aurélien Zanelli |
| 5 ; |
| 6 ; Redistribution and use in source and binary forms, with or without |
| 7 ; modification, are permitted provided that the following conditions |
| 8 ; are met: |
| 9 ; |
| 10 ; - Redistributions of source code must retain the above copyright |
| 11 ; notice, this list of conditions and the following disclaimer. |
| 12 ; |
| 13 ; - Redistributions in binary form must reproduce the above copyright |
| 14 ; notice, this list of conditions and the following disclaimer in the |
| 15 ; documentation and/or other materials provided with the distribution. |
| 16 ; |
| 17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
| 21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 28 |
| 29 AREA |.text|, CODE, READONLY |
| 30 |
| 31 GET celt/arm/armopts.s |
| 32 |
| 33 IF OPUS_ARM_MAY_HAVE_EDSP |
| 34 EXPORT celt_pitch_xcorr_edsp |
| 35 ENDIF |
| 36 |
| 37 IF OPUS_ARM_MAY_HAVE_NEON |
| 38 EXPORT celt_pitch_xcorr_neon |
| 39 ENDIF |
| 40 |
| 41 IF OPUS_ARM_MAY_HAVE_NEON |
| 42 |
| 43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 |
| 44 xcorr_kernel_neon PROC |
| 45 ; input: |
| 46 ; r3 = int len |
| 47 ; r4 = opus_val16 *x |
| 48 ; r5 = opus_val16 *y |
| 49 ; q0 = opus_val32 sum[4] |
| 50 ; output: |
| 51 ; q0 = opus_val32 sum[4] |
| 52 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 |
| 53 ; internal usage: |
| 54 ; r12 = int j |
| 55 ; d3 = y_3|y_2|y_1|y_0 |
| 56 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 |
| 57 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 |
| 58 ; q8 = scratch |
| 59 ; |
| 60 ; Load y[0...3] |
| 61 ; This requires len>0 to always be valid (which we assert in the C code). |
| 62 VLD1.16 {d5}, [r5]! |
| 63 SUBS r12, r3, #8 |
| 64 BLE xcorr_kernel_neon_process4 |
| 65 ; Process 8 samples at a time. |
| 66 ; This loop loads one y value more than we actually need. Therefore we have to |
| 67 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid |
| 68 ; reading past the end of the array. |
| 69 xcorr_kernel_neon_process8 |
| 70 ; This loop has 19 total instructions (10 cycles to issue, minimum), with |
| 71 ; - 2 cycles of ARM instructions, |
| 72 ; - 10 cycles of load/store/byte permute instructions, and |
| 73 ; - 9 cycles of data processing instructions. |
| 74 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the |
| 75 ; latter two categories, meaning the whole loop should run in 10 cycles per |
| 76 ; iteration, barring cache misses. |
| 77 ; |
| 78 ; Load x[0...7] |
| 79 VLD1.16 {d6, d7}, [r4]! |
| 80 ; Unlike VMOV, VAND is a data processing instruction (and doesn't get |
| 81 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. |
| 82 VAND d3, d5, d5 |
| 83 SUBS r12, r12, #8 |
| 84 ; Load y[4...11] |
| 85 VLD1.16 {d4, d5}, [r5]! |
| 86 VMLAL.S16 q0, d3, d6[0] |
| 87 VEXT.16 d16, d3, d4, #1 |
| 88 VMLAL.S16 q0, d4, d7[0] |
| 89 VEXT.16 d17, d4, d5, #1 |
| 90 VMLAL.S16 q0, d16, d6[1] |
| 91 VEXT.16 d16, d3, d4, #2 |
| 92 VMLAL.S16 q0, d17, d7[1] |
| 93 VEXT.16 d17, d4, d5, #2 |
| 94 VMLAL.S16 q0, d16, d6[2] |
| 95 VEXT.16 d16, d3, d4, #3 |
| 96 VMLAL.S16 q0, d17, d7[2] |
| 97 VEXT.16 d17, d4, d5, #3 |
| 98 VMLAL.S16 q0, d16, d6[3] |
| 99 VMLAL.S16 q0, d17, d7[3] |
| 100 BGT xcorr_kernel_neon_process8 |
| 101 ; Process 4 samples here if we have > 4 left (still reading one extra y value). |
| 102 xcorr_kernel_neon_process4 |
| 103 ADDS r12, r12, #4 |
| 104 BLE xcorr_kernel_neon_process2 |
| 105 ; Load x[0...3] |
| 106 VLD1.16 d6, [r4]! |
| 107 ; Use VAND since it's a data processing instruction again. |
| 108 VAND d4, d5, d5 |
| 109 SUB r12, r12, #4 |
| 110 ; Load y[4...7] |
| 111 VLD1.16 d5, [r5]! |
| 112 VMLAL.S16 q0, d4, d6[0] |
| 113 VEXT.16 d16, d4, d5, #1 |
| 114 VMLAL.S16 q0, d16, d6[1] |
| 115 VEXT.16 d16, d4, d5, #2 |
| 116 VMLAL.S16 q0, d16, d6[2] |
| 117 VEXT.16 d16, d4, d5, #3 |
| 118 VMLAL.S16 q0, d16, d6[3] |
| 119 ; Process 2 samples here if we have > 2 left (still reading one extra y value). |
| 120 xcorr_kernel_neon_process2 |
| 121 ADDS r12, r12, #2 |
| 122 BLE xcorr_kernel_neon_process1 |
| 123 ; Load x[0...1] |
| 124 VLD2.16 {d6[],d7[]}, [r4]! |
| 125 ; Use VAND since it's a data processing instruction again. |
| 126 VAND d4, d5, d5 |
| 127 SUB r12, r12, #2 |
| 128 ; Load y[4...5] |
| 129 VLD1.32 {d5[]}, [r5]! |
| 130 VMLAL.S16 q0, d4, d6 |
| 131 VEXT.16 d16, d4, d5, #1 |
| 132 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI |
| 133 ; instead of VEXT, since it's a data-processing instruction. |
| 134 VSRI.64 d5, d4, #32 |
| 135 VMLAL.S16 q0, d16, d7 |
| 136 ; Process 1 sample using the extra y value we loaded above. |
| 137 xcorr_kernel_neon_process1 |
| 138 ; Load next *x |
| 139 VLD1.16 {d6[]}, [r4]! |
| 140 ADDS r12, r12, #1 |
| 141 ; y[0...3] are left in d5 from prior iteration(s) (if any) |
| 142 VMLAL.S16 q0, d5, d6 |
| 143 MOVLE pc, lr |
| 144 ; Now process 1 last sample, not reading ahead. |
| 145 ; Load last *y |
| 146 VLD1.16 {d4[]}, [r5]! |
| 147 VSRI.64 d4, d5, #16 |
| 148 ; Load last *x |
| 149 VLD1.16 {d6[]}, [r4]! |
| 150 VMLAL.S16 q0, d4, d6 |
| 151 MOV pc, lr |
| 152 ENDP |
| 153 |
| 154 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, |
| 155 ; opus_val32 *xcorr, int len, int max_pitch) |
| 156 celt_pitch_xcorr_neon PROC |
| 157 ; input: |
| 158 ; r0 = opus_val16 *_x |
| 159 ; r1 = opus_val16 *_y |
| 160 ; r2 = opus_val32 *xcorr |
| 161 ; r3 = int len |
| 162 ; output: |
| 163 ; r0 = int maxcorr |
| 164 ; internal usage: |
| 165 ; r4 = opus_val16 *x (for xcorr_kernel_neon()) |
| 166 ; r5 = opus_val16 *y (for xcorr_kernel_neon()) |
| 167 ; r6 = int max_pitch |
| 168 ; r12 = int j |
| 169 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) |
| 170 STMFD sp!, {r4-r6, lr} |
| 171 LDR r6, [sp, #16] ; r6 = max_pitch (5th argument; 4 registers = 16 bytes pushed above) |
| 172 VMOV.S32 q15, #1 ; maxcorr[0...3] = 1 |
| 173 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done |
| 174 SUBS r6, r6, #4 |
| 175 BLT celt_pitch_xcorr_neon_process4_done |
| 176 celt_pitch_xcorr_neon_process4 |
| 177 ; xcorr_kernel_neon parameters: |
| 178 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} |
| 179 MOV r4, r0 |
| 180 MOV r5, r1 |
| 181 VEOR q0, q0, q0 |
| 182 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. |
| 183 ; So we don't save/restore any other registers. |
| 184 BL xcorr_kernel_neon |
| 185 SUBS r6, r6, #4 |
| 186 VST1.32 {q0}, [r2]! |
| 187 ; _y += 4 |
| 188 ADD r1, r1, #8 |
| 189 VMAX.S32 q15, q15, q0 |
| 190 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done |
| 191 BGE celt_pitch_xcorr_neon_process4 |
| 192 ; We have less than 4 sums left to compute. |
| 193 celt_pitch_xcorr_neon_process4_done |
| 194 ADDS r6, r6, #4 |
| 195 ; Reduce maxcorr to a single value |
| 196 VMAX.S32 d30, d30, d31 |
| 197 VPMAX.S32 d30, d30, d30 |
| 198 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done |
| 199 BLE celt_pitch_xcorr_neon_done |
| 200 ; Now compute each remaining sum one at a time. |
| 201 celt_pitch_xcorr_neon_process_remaining |
| 202 MOV r4, r0 |
| 203 MOV r5, r1 |
| 204 VMOV.I32 q0, #0 |
| 205 SUBS r12, r3, #8 |
| 206 BLT celt_pitch_xcorr_neon_process_remaining4 |
| 207 ; Sum terms 8 at a time. |
| 208 celt_pitch_xcorr_neon_process_remaining_loop8 |
| 209 ; Load x[0...7] |
| 210 VLD1.16 {q1}, [r4]! |
| 211 ; Load y[0...7] |
| 212 VLD1.16 {q2}, [r5]! |
| 213 SUBS r12, r12, #8 |
| 214 VMLAL.S16 q0, d4, d2 |
| 215 VMLAL.S16 q0, d5, d3 |
| 216 BGE celt_pitch_xcorr_neon_process_remaining_loop8 |
| 217 ; Sum terms 4 at a time. |
| 218 celt_pitch_xcorr_neon_process_remaining4 |
| 219 ADDS r12, r12, #4 |
| 220 BLT celt_pitch_xcorr_neon_process_remaining4_done |
| 221 ; Load x[0...3] |
| 222 VLD1.16 {d2}, [r4]! |
| 223 ; Load y[0...3] |
| 224 VLD1.16 {d3}, [r5]! |
| 225 SUB r12, r12, #4 |
| 226 VMLAL.S16 q0, d3, d2 |
| 227 celt_pitch_xcorr_neon_process_remaining4_done |
| 228 ; Reduce the sum to a single value. |
| 229 VADD.S32 d0, d0, d1 |
| 230 VPADDL.S32 d0, d0 |
| 231 ADDS r12, r12, #4 |
| 232 BLE celt_pitch_xcorr_neon_process_remaining_loop_done |
| 233 ; Sum terms 1 at a time. |
| 234 celt_pitch_xcorr_neon_process_remaining_loop1 |
| 235 VLD1.16 {d2[]}, [r4]! |
| 236 VLD1.16 {d3[]}, [r5]! |
| 237 SUBS r12, r12, #1 |
| 238 VMLAL.S16 q0, d2, d3 |
| 239 BGT celt_pitch_xcorr_neon_process_remaining_loop1 |
| 240 celt_pitch_xcorr_neon_process_remaining_loop_done |
| 241 VST1.32 {d0[0]}, [r2]! |
| 242 VMAX.S32 d30, d30, d0 |
| 243 SUBS r6, r6, #1 |
| 244 ; _y++ |
| 245 ADD r1, r1, #2 |
| 246 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining |
| 247 BGT celt_pitch_xcorr_neon_process_remaining |
| 248 celt_pitch_xcorr_neon_done |
| 249 VMOV.32 r0, d30[0] |
| 250 LDMFD sp!, {r4-r6, pc} |
| 251 ENDP |
| 252 |
| 253 ENDIF |
| 254 |
| 255 IF OPUS_ARM_MAY_HAVE_EDSP |
| 256 |
| 257 ; This will get used on ARMv7 devices without NEON, so it has been optimized |
| 258 ; to take advantage of dual-issuing where possible. |
| 259 xcorr_kernel_edsp PROC |
| 260 ; input: |
| 261 ; r3 = int len |
| 262 ; r4 = opus_val16 *_x (must be 32-bit aligned) |
| 263 ; r5 = opus_val16 *_y (must be 32-bit aligned) |
| 264 ; r6...r9 = opus_val32 sum[4] |
| 265 ; output: |
| 266 ; r6...r9 = opus_val32 sum[4] |
| 267 ; preserved: r0-r5 |
| 268 ; internal usage |
| 269 ; r2 = int j |
| 270 ; r12,r14 = opus_val16 x[4] |
| 271 ; r10,r11 = opus_val16 y[4] |
| 272 STMFD sp!, {r2,r4,r5,lr} ; r2/r4/r5 are in the preserved list but clobbered below |
| 273 LDR r10, [r5], #4 ; Load y[0...1] |
| 274 SUBS r2, r3, #4 ; j = len-4 |
| 275 LDR r11, [r5], #4 ; Load y[2...3] |
| 276 BLE xcorr_kernel_edsp_process4_done |
| 277 LDR r12, [r4], #4 ; Load x[0...1] |
| 278 ; Stall |
| 279 xcorr_kernel_edsp_process4 |
| 280 ; The multiplies must issue from pipeline 0, and can't dual-issue with each |
| 281 ; other. Every other instruction here dual-issues with a multiply, and is |
| 282 ; thus "free". There should be no stalls in the body of the loop. |
| 283 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) |
| 284 LDR r14, [r4], #4 ; Load x[2...3] |
| 285 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) |
| 286 SUBS r2, r2, #4 ; j-=4 |
| 287 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) |
| 288 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) |
| 289 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) |
| 290 LDR r10, [r5], #4 ; Load y[4...5] |
| 291 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) |
| 292 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) |
| 293 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) |
| 294 LDRGT r12, [r4], #4 ; Load x[0...1] |
| 295 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) |
| 296 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) |
| 297 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) |
| 298 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) |
| 299 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) |
| 300 LDR r11, [r5], #4 ; Load y[6...7] |
| 301 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) |
| 302 SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) |
| 303 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) |
| 304 BGT xcorr_kernel_edsp_process4 |
| 305 xcorr_kernel_edsp_process4_done |
| 306 ADDS r2, r2, #4 |
| 307 BLE xcorr_kernel_edsp_done |
| 308 LDRH r12, [r4], #2 ; r12 = *x++ |
| 309 SUBS r2, r2, #1 ; j-- |
| 310 ; Stall |
| 311 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) |
| 312 LDRGTH r14, [r4], #2 ; r14 = *x++ |
| 313 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) |
| 314 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) |
| 315 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) |
| 316 BLE xcorr_kernel_edsp_done |
| 317 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) |
| 318 SUBS r2, r2, #1 ; j-- |
| 319 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) |
| 320 LDRH r10, [r5], #2 ; r10 = y_4 = *y++ |
| 321 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) |
| 322 LDRGTH r12, [r4], #2 ; r12 = *x++ |
| 323 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) |
| 324 BLE xcorr_kernel_edsp_done |
| 325 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) |
| 326 CMP r2, #1 ; only the flags are needed (j > 1?); r2 is reused for y_5 below |
| 327 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) |
| 328 LDRH r2, [r5], #2 ; r2 = y_5 = *y++ |
| 329 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) |
| 330 LDRGTH r14, [r4] ; r14 = *x |
| 331 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) |
| 332 BLE xcorr_kernel_edsp_done |
| 333 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) |
| 334 LDRH r11, [r5] ; r11 = y_6 = *y |
| 335 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) |
| 336 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) |
| 337 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) |
| 338 xcorr_kernel_edsp_done |
| 339 LDMFD sp!, {r2,r4,r5,pc} |
| 340 ENDP |
| 341 |
| 342 celt_pitch_xcorr_edsp PROC |
| 343 ; input: |
| 344 ; r0 = opus_val16 *_x (must be 32-bit aligned) |
| 345 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) |
| 346 ; r2 = opus_val32 *xcorr |
| 347 ; r3 = int len |
| 348 ; output: |
| 349 ; r0 = maxcorr |
| 350 ; internal usage |
| 351 ; r4 = opus_val16 *x |
| 352 ; r5 = opus_val16 *y |
| 353 ; r6 = opus_val32 sum0 |
| 354 ; r7 = opus_val32 sum1 |
| 355 ; r8 = opus_val32 sum2 |
| 356 ; r9 = opus_val32 sum3 |
| 357 ; r1 = int max_pitch |
| 358 ; r12 = int j |
| 359 STMFD sp!, {r4-r11, lr} |
| 360 MOV r5, r1 |
| 361 LDR r1, [sp, #36] ; r1 = max_pitch (5th argument; 9 registers = 36 bytes pushed above) |
| 362 MOV r4, r0 |
| 363 TST r5, #3 |
| 364 ; maxcorr = 1 |
| 365 MOV r0, #1 |
| 366 BEQ celt_pitch_xcorr_edsp_process1u_done |
| 367 ; Compute one sum at the start to make y 32-bit aligned. |
| 368 SUBS r12, r3, #4 |
| 369 ; r14 = sum = 0 |
| 370 MOV r14, #0 |
| 371 LDRH r8, [r5], #2 |
| 372 BLE celt_pitch_xcorr_edsp_process1u_loop4_done |
| 373 LDR r6, [r4], #4 |
| 374 MOV r8, r8, LSL #16 ; move y_0 to the top half of r8 for the SMLABT below |
| 375 celt_pitch_xcorr_edsp_process1u_loop4 |
| 376 LDR r9, [r5], #4 |
| 377 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 378 LDR r7, [r4], #4 |
| 379 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 380 LDR r8, [r5], #4 |
| 381 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) |
| 382 SUBS r12, r12, #4 ; j-=4 |
| 383 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) |
| 384 LDRGT r6, [r4], #4 |
| 385 BGT celt_pitch_xcorr_edsp_process1u_loop4 |
| 386 MOV r8, r8, LSR #16 ; move the next y back to the bottom half for the SMLABB tail |
| 387 celt_pitch_xcorr_edsp_process1u_loop4_done |
| 388 ADDS r12, r12, #4 |
| 389 celt_pitch_xcorr_edsp_process1u_loop1 |
| 390 LDRGEH r6, [r4], #2 |
| 391 ; Stall |
| 392 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) |
| 393 SUBGES r12, r12, #1 |
| 394 LDRGTH r8, [r5], #2 |
| 395 BGT celt_pitch_xcorr_edsp_process1u_loop1 |
| 396 ; Restore _x |
| 397 SUB r4, r4, r3, LSL #1 |
| 398 ; Restore and advance _y |
| 399 SUB r5, r5, r3, LSL #1 |
| 400 ; maxcorr = max(maxcorr, sum) |
| 401 CMP r0, r14 |
| 402 ADD r5, r5, #2 |
| 403 MOVLT r0, r14 |
| 404 SUBS r1, r1, #1 |
| 405 ; xcorr[i] = sum |
| 406 STR r14, [r2], #4 |
| 407 BLE celt_pitch_xcorr_edsp_done |
| 408 celt_pitch_xcorr_edsp_process1u_done |
| 409 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 |
| 410 SUBS r1, r1, #4 |
| 411 BLT celt_pitch_xcorr_edsp_process2 |
| 412 celt_pitch_xcorr_edsp_process4 |
| 413 ; xcorr_kernel_edsp parameters: |
| 414 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} |
| 415 MOV r6, #0 |
| 416 MOV r7, #0 |
| 417 MOV r8, #0 |
| 418 MOV r9, #0 |
| 419 BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) |
| 420 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) |
| 421 CMP r0, r6 |
| 422 ; _y+=4 |
| 423 ADD r5, r5, #8 |
| 424 MOVLT r0, r6 |
| 425 CMP r0, r7 |
| 426 MOVLT r0, r7 |
| 427 CMP r0, r8 |
| 428 MOVLT r0, r8 |
| 429 CMP r0, r9 |
| 430 MOVLT r0, r9 |
| 431 STMIA r2!, {r6-r9} |
| 432 SUBS r1, r1, #4 |
| 433 BGE celt_pitch_xcorr_edsp_process4 |
| 434 celt_pitch_xcorr_edsp_process2 |
| 435 ADDS r1, r1, #2 |
| 436 BLT celt_pitch_xcorr_edsp_process1a |
| 437 SUBS r12, r3, #4 |
| 438 ; {r10, r11} = {sum0, sum1} = {0, 0} |
| 439 MOV r10, #0 |
| 440 MOV r11, #0 |
| 441 LDR r8, [r5], #4 |
| 442 BLE celt_pitch_xcorr_edsp_process2_loop_done |
| 443 LDR r6, [r4], #4 |
| 444 LDR r9, [r5], #4 |
| 445 celt_pitch_xcorr_edsp_process2_loop4 |
| 446 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 447 LDR r7, [r4], #4 |
| 448 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 449 SUBS r12, r12, #4 ; j-=4 |
| 450 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) |
| 451 LDR r8, [r5], #4 |
| 452 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) |
| 453 LDRGT r6, [r4], #4 |
| 454 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) |
| 455 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) |
| 456 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) |
| 457 LDRGT r9, [r5], #4 |
| 458 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) |
| 459 BGT celt_pitch_xcorr_edsp_process2_loop4 |
| 460 celt_pitch_xcorr_edsp_process2_loop_done |
| 461 ADDS r12, r12, #2 |
| 462 BLE celt_pitch_xcorr_edsp_process2_1 |
| 463 LDR r6, [r4], #4 |
| 464 ; Stall |
| 465 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 466 LDR r9, [r5], #4 |
| 467 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 468 SUB r12, r12, #2 |
| 469 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) |
| 470 MOV r8, r9 |
| 471 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) |
| 472 celt_pitch_xcorr_edsp_process2_1 |
| 473 LDRH r6, [r4], #2 |
| 474 ADDS r12, r12, #1 |
| 475 ; Stall |
| 476 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) |
| 477 LDRGTH r7, [r4], #2 |
| 478 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) |
| 479 BLE celt_pitch_xcorr_edsp_process2_done |
| 480 LDRH r9, [r5], #2 |
| 481 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) |
| 482 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) |
| 483 celt_pitch_xcorr_edsp_process2_done |
| 484 ; Restore _x |
| 485 SUB r4, r4, r3, LSL #1 |
| 486 ; Restore and advance _y |
| 487 SUB r5, r5, r3, LSL #1 |
| 488 ; maxcorr = max(maxcorr, sum0) |
| 489 CMP r0, r10 |
| 490 ADD r5, r5, #2 |
| 491 MOVLT r0, r10 |
| 492 SUB r1, r1, #2 |
| 493 ; maxcorr = max(maxcorr, sum1) |
| 494 CMP r0, r11 |
| 495 ; xcorr[i] = sum |
| 496 STR r10, [r2], #4 |
| 497 MOVLT r0, r11 |
| 498 STR r11, [r2], #4 |
| 499 celt_pitch_xcorr_edsp_process1a |
| 500 ADDS r1, r1, #1 |
| 501 BLT celt_pitch_xcorr_edsp_done |
| 502 SUBS r12, r3, #4 |
| 503 ; r14 = sum = 0 |
| 504 MOV r14, #0 |
| 505 BLT celt_pitch_xcorr_edsp_process1a_loop_done |
| 506 LDR r6, [r4], #4 |
| 507 LDR r8, [r5], #4 |
| 508 LDR r7, [r4], #4 |
| 509 LDR r9, [r5], #4 |
| 510 celt_pitch_xcorr_edsp_process1a_loop4 |
| 511 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 512 SUBS r12, r12, #4 ; j-=4 |
| 513 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 514 LDRGE r6, [r4], #4 |
| 515 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) |
| 516 LDRGE r8, [r5], #4 |
| 517 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) |
| 518 LDRGE r7, [r4], #4 |
| 519 LDRGE r9, [r5], #4 |
| 520 BGE celt_pitch_xcorr_edsp_process1a_loop4 |
| 521 celt_pitch_xcorr_edsp_process1a_loop_done |
| 522 ADDS r12, r12, #2 |
| 523 LDRGE r6, [r4], #4 |
| 524 LDRGE r8, [r5], #4 |
| 525 ; Stall |
| 526 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) |
| 527 SUBGE r12, r12, #2 |
| 528 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) |
| 529 ADDS r12, r12, #1 |
| 530 LDRGEH r6, [r4], #2 |
| 531 LDRGEH r8, [r5], #2 |
| 532 ; Stall |
| 533 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) |
| 534 ; maxcorr = max(maxcorr, sum) |
| 535 CMP r0, r14 |
| 536 ; xcorr[i] = sum |
| 537 STR r14, [r2], #4 |
| 538 MOVLT r0, r14 |
| 539 celt_pitch_xcorr_edsp_done |
| 540 LDMFD sp!, {r4-r11, pc} |
| 541 ENDP |
| 542 |
| 543 ENDIF |
| 544 |
| 545 END |
OLD | NEW |