OLD | NEW |
1 ; Copyright (c) 2007-2008 CSIRO | 1 ; Copyright (c) 2007-2008 CSIRO |
2 ; Copyright (c) 2007-2009 Xiph.Org Foundation | 2 ; Copyright (c) 2007-2009 Xiph.Org Foundation |
3 ; Copyright (c) 2013 Parrot | 3 ; Copyright (c) 2013 Parrot |
4 ; Written by Aurélien Zanelli | 4 ; Written by Aurélien Zanelli |
5 ; | 5 ; |
6 ; Redistribution and use in source and binary forms, with or without | 6 ; Redistribution and use in source and binary forms, with or without |
7 ; modification, are permitted provided that the following conditions | 7 ; modification, are permitted provided that the following conditions |
8 ; are met: | 8 ; are met: |
9 ; | 9 ; |
10 ; - Redistributions of source code must retain the above copyright | 10 ; - Redistributions of source code must retain the above copyright |
(...skipping 24 matching lines...) Expand all Loading... |
35 ENDIF | 35 ENDIF |
36 | 36 |
37 IF OPUS_ARM_MAY_HAVE_NEON | 37 IF OPUS_ARM_MAY_HAVE_NEON |
38 EXPORT celt_pitch_xcorr_neon | 38 EXPORT celt_pitch_xcorr_neon |
39 ENDIF | 39 ENDIF |
40 | 40 |
41 IF OPUS_ARM_MAY_HAVE_NEON | 41 IF OPUS_ARM_MAY_HAVE_NEON |
42 | 42 |
43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 | 43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 |
44 xcorr_kernel_neon PROC | 44 xcorr_kernel_neon PROC |
| 45 xcorr_kernel_neon_start |
45 ; input: | 46 ; input: |
46 ; r3 = int len | 47 ; r3 = int len |
47 ; r4 = opus_val16 *x | 48 ; r4 = opus_val16 *x |
48 ; r5 = opus_val16 *y | 49 ; r5 = opus_val16 *y |
49 ; q0 = opus_val32 sum[4] | 50 ; q0 = opus_val32 sum[4] |
50 ; output: | 51 ; output: |
51 ; q0 = opus_val32 sum[4] | 52 ; q0 = opus_val32 sum[4] |
52 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 | 53 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 |
53 ; internal usage: | 54 ; internal usage: |
54 ; r12 = int j | 55 ; r12 = int j |
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
174 SUBS r6, r6, #4 | 175 SUBS r6, r6, #4 |
175 BLT celt_pitch_xcorr_neon_process4_done | 176 BLT celt_pitch_xcorr_neon_process4_done |
176 celt_pitch_xcorr_neon_process4 | 177 celt_pitch_xcorr_neon_process4 |
177 ; xcorr_kernel_neon parameters: | 178 ; xcorr_kernel_neon parameters: |
178 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} | 179 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} |
179 MOV r4, r0 | 180 MOV r4, r0 |
180 MOV r5, r1 | 181 MOV r5, r1 |
181 VEOR q0, q0, q0 | 182 VEOR q0, q0, q0 |
182 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. | 183 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. |
183 ; So we don't save/restore any other registers. | 184 ; So we don't save/restore any other registers. |
184 BL xcorr_kernel_neon | 185 BL xcorr_kernel_neon_start |
185 SUBS r6, r6, #4 | 186 SUBS r6, r6, #4 |
186 VST1.32 {q0}, [r2]! | 187 VST1.32 {q0}, [r2]! |
187 ; _y += 4 | 188 ; _y += 4 |
188 ADD r1, r1, #8 | 189 ADD r1, r1, #8 |
189 VMAX.S32 q15, q15, q0 | 190 VMAX.S32 q15, q15, q0 |
190 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done | 191 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done |
191 BGE celt_pitch_xcorr_neon_process4 | 192 BGE celt_pitch_xcorr_neon_process4 |
192 ; We have less than 4 sums left to compute. | 193 ; We have less than 4 sums left to compute. |
193 celt_pitch_xcorr_neon_process4_done | 194 celt_pitch_xcorr_neon_process4_done |
194 ADDS r6, r6, #4 | 195 ADDS r6, r6, #4 |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
250 LDMFD sp!, {r4-r6, pc} | 251 LDMFD sp!, {r4-r6, pc} |
251 ENDP | 252 ENDP |
252 | 253 |
253 ENDIF | 254 ENDIF |
254 | 255 |
255 IF OPUS_ARM_MAY_HAVE_EDSP | 256 IF OPUS_ARM_MAY_HAVE_EDSP |
256 | 257 |
257 ; This will get used on ARMv7 devices without NEON, so it has been optimized | 258 ; This will get used on ARMv7 devices without NEON, so it has been optimized |
258 ; to take advantage of dual-issuing where possible. | 259 ; to take advantage of dual-issuing where possible. |
259 xcorr_kernel_edsp PROC | 260 xcorr_kernel_edsp PROC |
| 261 xcorr_kernel_edsp_start |
260 ; input: | 262 ; input: |
261 ; r3 = int len | 263 ; r3 = int len |
262 ; r4 = opus_val16 *_x (must be 32-bit aligned) | 264 ; r4 = opus_val16 *_x (must be 32-bit aligned) |
263 ; r5 = opus_val16 *_y (must be 32-bit aligned) | 265 ; r5 = opus_val16 *_y (must be 32-bit aligned) |
264 ; r6...r9 = opus_val32 sum[4] | 266 ; r6...r9 = opus_val32 sum[4] |
265 ; output: | 267 ; output: |
266 ; r6...r9 = opus_val32 sum[4] | 268 ; r6...r9 = opus_val32 sum[4] |
267 ; preserved: r0-r5 | 269 ; preserved: r0-r5 |
268 ; internal usage | 270 ; internal usage |
269 ; r2 = int j | 271 ; r2 = int j |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
409 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 | 411 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 |
410 SUBS r1, r1, #4 | 412 SUBS r1, r1, #4 |
411 BLT celt_pitch_xcorr_edsp_process2 | 413 BLT celt_pitch_xcorr_edsp_process2 |
412 celt_pitch_xcorr_edsp_process4 | 414 celt_pitch_xcorr_edsp_process4 |
413 ; xcorr_kernel_edsp parameters: | 415 ; xcorr_kernel_edsp parameters: |
414 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} | 416 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} |
415 MOV r6, #0 | 417 MOV r6, #0 |
416 MOV r7, #0 | 418 MOV r7, #0 |
417 MOV r8, #0 | 419 MOV r8, #0 |
418 MOV r9, #0 | 420 MOV r9, #0 |
419 BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) | 421 BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) |
420 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) | 422 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) |
421 CMP r0, r6 | 423 CMP r0, r6 |
422 ; _y+=4 | 424 ; _y+=4 |
423 ADD r5, r5, #8 | 425 ADD r5, r5, #8 |
424 MOVLT r0, r6 | 426 MOVLT r0, r6 |
425 CMP r0, r7 | 427 CMP r0, r7 |
426 MOVLT r0, r7 | 428 MOVLT r0, r7 |
427 CMP r0, r8 | 429 CMP r0, r8 |
428 MOVLT r0, r8 | 430 MOVLT r0, r8 |
429 CMP r0, r9 | 431 CMP r0, r9 |
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
536 ; xcorr[i] = sum | 538 ; xcorr[i] = sum |
537 STR r14, [r2], #4 | 539 STR r14, [r2], #4 |
538 MOVLT r0, r14 | 540 MOVLT r0, r14 |
539 celt_pitch_xcorr_edsp_done | 541 celt_pitch_xcorr_edsp_done |
540 LDMFD sp!, {r4-r11, pc} | 542 LDMFD sp!, {r4-r11, pc} |
541 ENDP | 543 ENDP |
542 | 544 |
543 ENDIF | 545 ENDIF |
544 | 546 |
545 END | 547 END |
OLD | NEW |