OLD | NEW |
1 /* Copyright (c) 2014, Cisco Systems, INC | 1 /* Copyright (c) 2014, Cisco Systems, INC |
2 Written by XiangMingZhu WeiZhou MinPeng YanWang | 2 Written by XiangMingZhu WeiZhou MinPeng YanWang |
3 | 3 |
4 Redistribution and use in source and binary forms, with or without | 4 Redistribution and use in source and binary forms, with or without |
5 modification, are permitted provided that the following conditions | 5 modification, are permitted provided that the following conditions |
6 are met: | 6 are met: |
7 | 7 |
8 - Redistributions of source code must retain the above copyright | 8 - Redistributions of source code must retain the above copyright |
9 notice, this list of conditions and the following disclaimer. | 9 notice, this list of conditions and the following disclaimer. |
10 | 10 |
(...skipping 22 matching lines...) Expand all Loading... |
33 #include <emmintrin.h> | 33 #include <emmintrin.h> |
34 #include <smmintrin.h> | 34 #include <smmintrin.h> |
35 #include "celt_lpc.h" | 35 #include "celt_lpc.h" |
36 #include "stack_alloc.h" | 36 #include "stack_alloc.h" |
37 #include "mathops.h" | 37 #include "mathops.h" |
38 #include "pitch.h" | 38 #include "pitch.h" |
39 #include "x86cpu.h" | 39 #include "x86cpu.h" |
40 | 40 |
41 #if defined(FIXED_POINT) | 41 #if defined(FIXED_POINT) |
42 | 42 |
43 void celt_fir_sse4_1(const opus_val16 *_x, | 43 void celt_fir_sse4_1(const opus_val16 *x, |
44 const opus_val16 *num, | 44 const opus_val16 *num, |
45 opus_val16 *_y, | 45 opus_val16 *y, |
46 int N, | 46 int N, |
47 int ord, | 47 int ord, |
48 opus_val16 *mem, | |
49 int arch) | 48 int arch) |
50 { | 49 { |
51 int i,j; | 50 int i,j; |
52 VARDECL(opus_val16, rnum); | 51 VARDECL(opus_val16, rnum); |
53 VARDECL(opus_val16, x); | |
54 | 52 |
55 __m128i vecNoA; | 53 __m128i vecNoA; |
56 opus_int32 noA ; | 54 opus_int32 noA ; |
57 SAVE_STACK; | 55 SAVE_STACK; |
58 | 56 |
59 ALLOC(rnum, ord, opus_val16); | 57 ALLOC(rnum, ord, opus_val16); |
60 ALLOC(x, N+ord, opus_val16); | |
61 for(i=0;i<ord;i++) | 58 for(i=0;i<ord;i++) |
62 rnum[i] = num[ord-i-1]; | 59 rnum[i] = num[ord-i-1]; |
63 for(i=0;i<ord;i++) | |
64 x[i] = mem[ord-i-1]; | |
65 | |
66 for (i=0;i<N-7;i+=8) | |
67 { | |
68 x[i+ord ]=_x[i ]; | |
69 x[i+ord+1]=_x[i+1]; | |
70 x[i+ord+2]=_x[i+2]; | |
71 x[i+ord+3]=_x[i+3]; | |
72 x[i+ord+4]=_x[i+4]; | |
73 x[i+ord+5]=_x[i+5]; | |
74 x[i+ord+6]=_x[i+6]; | |
75 x[i+ord+7]=_x[i+7]; | |
76 } | |
77 | |
78 for (;i<N-3;i+=4) | |
79 { | |
80 x[i+ord ]=_x[i ]; | |
81 x[i+ord+1]=_x[i+1]; | |
82 x[i+ord+2]=_x[i+2]; | |
83 x[i+ord+3]=_x[i+3]; | |
84 } | |
85 | |
86 for (;i<N;i++) | |
87 x[i+ord]=_x[i]; | |
88 | |
89 for(i=0;i<ord;i++) | |
90 mem[i] = _x[N-i-1]; | |
91 #ifdef SMALL_FOOTPRINT | |
92 for (i=0;i<N;i++) | |
93 { | |
94 opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); | |
95 for (j=0;j<ord;j++) | |
96 { | |
97 sum = MAC16_16(sum,rnum[j],x[i+j]); | |
98 } | |
99 _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); | |
100 } | |
101 #else | |
102 noA = EXTEND32(1) << SIG_SHIFT >> 1; | 60 noA = EXTEND32(1) << SIG_SHIFT >> 1; |
103 vecNoA = _mm_set_epi32(noA, noA, noA, noA); | 61 vecNoA = _mm_set_epi32(noA, noA, noA, noA); |
104 | 62 |
105 for (i=0;i<N-3;i+=4) | 63 for (i=0;i<N-3;i+=4) |
106 { | 64 { |
107 opus_val32 sums[4] = {0}; | 65 opus_val32 sums[4] = {0}; |
108 __m128i vecSum, vecX; | 66 __m128i vecSum, vecX; |
109 | 67 |
110 xcorr_kernel(rnum, x+i, sums, ord, arch); | 68 xcorr_kernel(rnum, x+i-ord, sums, ord, arch); |
111 | 69 |
112 vecSum = _mm_loadu_si128((__m128i *)sums); | 70 vecSum = _mm_loadu_si128((__m128i *)sums); |
113 vecSum = _mm_add_epi32(vecSum, vecNoA); | 71 vecSum = _mm_add_epi32(vecSum, vecNoA); |
114 vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); | 72 vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); |
115 vecX = OP_CVTEPI16_EPI32_M64(_x + i); | 73 vecX = OP_CVTEPI16_EPI32_M64(x + i); |
116 vecSum = _mm_add_epi32(vecSum, vecX); | 74 vecSum = _mm_add_epi32(vecSum, vecX); |
117 vecSum = _mm_packs_epi32(vecSum, vecSum); | 75 vecSum = _mm_packs_epi32(vecSum, vecSum); |
118 _mm_storel_epi64((__m128i *)(_y + i), vecSum); | 76 _mm_storel_epi64((__m128i *)(y + i), vecSum); |
119 } | 77 } |
120 for (;i<N;i++) | 78 for (;i<N;i++) |
121 { | 79 { |
122 opus_val32 sum = 0; | 80 opus_val32 sum = 0; |
123 for (j=0;j<ord;j++) | 81 for (j=0;j<ord;j++) |
124 sum = MAC16_16(sum, rnum[j], x[i + j]); | 82 sum = MAC16_16(sum, rnum[j], x[i+j-ord]); |
125 _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); | 83 y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); |
126 } | 84 } |
127 | 85 |
128 #endif | |
129 RESTORE_STACK; | 86 RESTORE_STACK; |
130 } | 87 } |
131 | 88 |
132 #endif | 89 #endif |
OLD | NEW |