| OLD | NEW |
| 1 ; Copyright (c) 2007-2008 CSIRO | 1 ; Copyright (c) 2007-2008 CSIRO |
| 2 ; Copyright (c) 2007-2009 Xiph.Org Foundation | 2 ; Copyright (c) 2007-2009 Xiph.Org Foundation |
| 3 ; Copyright (c) 2013 Parrot | 3 ; Copyright (c) 2013 Parrot |
| 4 ; Written by Aurélien Zanelli | 4 ; Written by Aurélien Zanelli |
| 5 ; | 5 ; |
| 6 ; Redistribution and use in source and binary forms, with or without | 6 ; Redistribution and use in source and binary forms, with or without |
| 7 ; modification, are permitted provided that the following conditions | 7 ; modification, are permitted provided that the following conditions |
| 8 ; are met: | 8 ; are met: |
| 9 ; | 9 ; |
| 10 ; - Redistributions of source code must retain the above copyright | 10 ; - Redistributions of source code must retain the above copyright |
| (...skipping 24 matching lines...) Expand all Loading... |
| 35 ENDIF | 35 ENDIF |
| 36 | 36 |
| 37 IF OPUS_ARM_MAY_HAVE_NEON | 37 IF OPUS_ARM_MAY_HAVE_NEON |
| 38 EXPORT celt_pitch_xcorr_neon | 38 EXPORT celt_pitch_xcorr_neon |
| 39 ENDIF | 39 ENDIF |
| 40 | 40 |
| 41 IF OPUS_ARM_MAY_HAVE_NEON | 41 IF OPUS_ARM_MAY_HAVE_NEON |
| 42 | 42 |
| 43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 | 43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 |
| 44 xcorr_kernel_neon PROC | 44 xcorr_kernel_neon PROC |
| 45 xcorr_kernel_neon_start |
| 45 ; input: | 46 ; input: |
| 46 ; r3 = int len | 47 ; r3 = int len |
| 47 ; r4 = opus_val16 *x | 48 ; r4 = opus_val16 *x |
| 48 ; r5 = opus_val16 *y | 49 ; r5 = opus_val16 *y |
| 49 ; q0 = opus_val32 sum[4] | 50 ; q0 = opus_val32 sum[4] |
| 50 ; output: | 51 ; output: |
| 51 ; q0 = opus_val32 sum[4] | 52 ; q0 = opus_val32 sum[4] |
| 52 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 | 53 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 |
| 53 ; internal usage: | 54 ; internal usage: |
| 54 ; r12 = int j | 55 ; r12 = int j |
| (...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 174 SUBS r6, r6, #4 | 175 SUBS r6, r6, #4 |
| 175 BLT celt_pitch_xcorr_neon_process4_done | 176 BLT celt_pitch_xcorr_neon_process4_done |
| 176 celt_pitch_xcorr_neon_process4 | 177 celt_pitch_xcorr_neon_process4 |
| 177 ; xcorr_kernel_neon parameters: | 178 ; xcorr_kernel_neon parameters: |
| 178 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} | 179 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} |
| 179 MOV r4, r0 | 180 MOV r4, r0 |
| 180 MOV r5, r1 | 181 MOV r5, r1 |
| 181 VEOR q0, q0, q0 | 182 VEOR q0, q0, q0 |
| 182 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. | 183 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. |
| 183 ; So we don't save/restore any other registers. | 184 ; So we don't save/restore any other registers. |
| 184 BL xcorr_kernel_neon | 185 BL xcorr_kernel_neon_start |
| 185 SUBS r6, r6, #4 | 186 SUBS r6, r6, #4 |
| 186 VST1.32 {q0}, [r2]! | 187 VST1.32 {q0}, [r2]! |
| 187 ; _y += 4 | 188 ; _y += 4 |
| 188 ADD r1, r1, #8 | 189 ADD r1, r1, #8 |
| 189 VMAX.S32 q15, q15, q0 | 190 VMAX.S32 q15, q15, q0 |
| 190 ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4 | 191 ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4 |
| 191 BGE celt_pitch_xcorr_neon_process4 | 192 BGE celt_pitch_xcorr_neon_process4 |
| 192 ; We have fewer than 4 sums left to compute. | 193 ; We have fewer than 4 sums left to compute. |
| 193 celt_pitch_xcorr_neon_process4_done | 194 celt_pitch_xcorr_neon_process4_done |
| 194 ADDS r6, r6, #4 | 195 ADDS r6, r6, #4 |
| (...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 250 LDMFD sp!, {r4-r6, pc} | 251 LDMFD sp!, {r4-r6, pc} |
| 251 ENDP | 252 ENDP |
| 252 | 253 |
| 253 ENDIF | 254 ENDIF |
| 254 | 255 |
| 255 IF OPUS_ARM_MAY_HAVE_EDSP | 256 IF OPUS_ARM_MAY_HAVE_EDSP |
| 256 | 257 |
| 257 ; This will get used on ARMv7 devices without NEON, so it has been optimized | 258 ; This will get used on ARMv7 devices without NEON, so it has been optimized |
| 258 ; to take advantage of dual-issuing where possible. | 259 ; to take advantage of dual-issuing where possible. |
| 259 xcorr_kernel_edsp PROC | 260 xcorr_kernel_edsp PROC |
| 261 xcorr_kernel_edsp_start |
| 260 ; input: | 262 ; input: |
| 261 ; r3 = int len | 263 ; r3 = int len |
| 262 ; r4 = opus_val16 *_x (must be 32-bit aligned) | 264 ; r4 = opus_val16 *_x (must be 32-bit aligned) |
| 263 ; r5 = opus_val16 *_y (must be 32-bit aligned) | 265 ; r5 = opus_val16 *_y (must be 32-bit aligned) |
| 264 ; r6...r9 = opus_val32 sum[4] | 266 ; r6...r9 = opus_val32 sum[4] |
| 265 ; output: | 267 ; output: |
| 266 ; r6...r9 = opus_val32 sum[4] | 268 ; r6...r9 = opus_val32 sum[4] |
| 267 ; preserved: r0-r5 | 269 ; preserved: r0-r5 |
| 268 ; internal usage | 270 ; internal usage |
| 269 ; r2 = int j | 271 ; r2 = int j |
| (...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 409 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 | 411 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 |
| 410 SUBS r1, r1, #4 | 412 SUBS r1, r1, #4 |
| 411 BLT celt_pitch_xcorr_edsp_process2 | 413 BLT celt_pitch_xcorr_edsp_process2 |
| 412 celt_pitch_xcorr_edsp_process4 | 414 celt_pitch_xcorr_edsp_process4 |
| 413 ; xcorr_kernel_edsp parameters: | 415 ; xcorr_kernel_edsp parameters: |
| 414 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} | 416 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} |
| 415 MOV r6, #0 | 417 MOV r6, #0 |
| 416 MOV r7, #0 | 418 MOV r7, #0 |
| 417 MOV r8, #0 | 419 MOV r8, #0 |
| 418 MOV r9, #0 | 420 MOV r9, #0 |
| 419 BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) | 421 BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) |
| 420 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) | 422 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) |
| 421 CMP r0, r6 | 423 CMP r0, r6 |
| 422 ; _y+=4 | 424 ; _y+=4 |
| 423 ADD r5, r5, #8 | 425 ADD r5, r5, #8 |
| 424 MOVLT r0, r6 | 426 MOVLT r0, r6 |
| 425 CMP r0, r7 | 427 CMP r0, r7 |
| 426 MOVLT r0, r7 | 428 MOVLT r0, r7 |
| 427 CMP r0, r8 | 429 CMP r0, r8 |
| 428 MOVLT r0, r8 | 430 MOVLT r0, r8 |
| 429 CMP r0, r9 | 431 CMP r0, r9 |
| (...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 536 ; xcorr[i] = sum | 538 ; xcorr[i] = sum |
| 537 STR r14, [r2], #4 | 539 STR r14, [r2], #4 |
| 538 MOVLT r0, r14 | 540 MOVLT r0, r14 |
| 539 celt_pitch_xcorr_edsp_done | 541 celt_pitch_xcorr_edsp_done |
| 540 LDMFD sp!, {r4-r11, pc} | 542 LDMFD sp!, {r4-r11, pc} |
| 541 ENDP | 543 ENDP |
| 542 | 544 |
| 543 ENDIF | 545 ENDIF |
| 544 | 546 |
| 545 END | 547 END |
| OLD | NEW |