OLD | NEW |
1 /* | 1 /* |
2 * The copyright in this software is being made available under the 2-clauses | 2 * The copyright in this software is being made available under the 2-clauses |
3 * BSD License, included below. This software may be subject to other third | 3 * BSD License, included below. This software may be subject to other third |
4 * party and contributor rights, including patent rights, and no such rights | 4 * party and contributor rights, including patent rights, and no such rights |
5 * are granted under this license. | 5 * are granted under this license. |
6 * | 6 * |
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium | 7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium |
8 * Copyright (c) 2002-2014, Professor Benoit Macq | 8 * Copyright (c) 2002-2014, Professor Benoit Macq |
9 * Copyright (c) 2001-2003, David Janssens | 9 * Copyright (c) 2001-2003, David Janssens |
10 * Copyright (c) 2002-2003, Yannick Verschueren | 10 * Copyright (c) 2002-2003, Yannick Verschueren |
(...skipping 132 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
143 /* Inverse irreversible MCT. */ | 143 /* Inverse irreversible MCT. */ |
144 /* </summary> */ | 144 /* </summary> */ |
145 void opj_mct_decode_real( | 145 void opj_mct_decode_real( |
146 OPJ_FLOAT32* restrict c0, | 146 OPJ_FLOAT32* restrict c0, |
147 OPJ_FLOAT32* restrict c1, | 147 OPJ_FLOAT32* restrict c1, |
148 OPJ_FLOAT32* restrict c2, | 148 OPJ_FLOAT32* restrict c2, |
149 OPJ_UINT32 n) | 149 OPJ_UINT32 n) |
150 { | 150 { |
151 OPJ_UINT32 i; | 151 OPJ_UINT32 i; |
152 #ifdef __SSE__ | 152 #ifdef __SSE__ |
153 » // Mantis BUGID: 0056291. The address must be 16-byte aligned. | 153 __m128 vrv, vgu, vgv, vbu; |
154 » // TestFile: fuzz-signal_sigsegv_6e9e7f_5076_5265.pdf | 154 vrv = _mm_set1_ps(1.402f); |
155 » if ((OPJ_UINT32)c0 % 16 == 0 && (OPJ_UINT32)c1 % 16 == 0 && (OPJ_UINT32)
c2 % 16 == 0){ | 155 vgu = _mm_set1_ps(0.34413f); |
156 » » __m128 vrv, vgu, vgv, vbu; | 156 vgv = _mm_set1_ps(0.71414f); |
157 » » vrv = _mm_set1_ps(1.402f); | 157 vbu = _mm_set1_ps(1.772f); |
158 » » vgu = _mm_set1_ps(0.34413f); | 158 for (i = 0; i < (n >> 3); ++i) { |
159 » » vgv = _mm_set1_ps(0.71414f); | 159 __m128 vy, vu, vv; |
160 » » vbu = _mm_set1_ps(1.772f); | 160 __m128 vr, vg, vb; |
161 » » for (i = 0; i < (n >> 3); ++i) { | |
162 » » » __m128 vy, vu, vv; | |
163 » » » __m128 vr, vg, vb; | |
164 | 161 |
165 » » » vy = _mm_load_ps(c0); | 162 vy = _mm_load_ps(c0); |
166 » » » vu = _mm_load_ps(c1); | 163 vu = _mm_load_ps(c1); |
167 » » » vv = _mm_load_ps(c2); | 164 vv = _mm_load_ps(c2); |
168 » » » vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); | 165 vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); |
169 » » » vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm
_mul_ps(vv, vgv)); | 166 vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)
); |
170 » » » vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); | 167 vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); |
171 » » » _mm_store_ps(c0, vr); | 168 _mm_store_ps(c0, vr); |
172 » » » _mm_store_ps(c1, vg); | 169 _mm_store_ps(c1, vg); |
173 » » » _mm_store_ps(c2, vb); | 170 _mm_store_ps(c2, vb); |
174 » » » c0 += 4; | 171 c0 += 4; |
175 » » » c1 += 4; | 172 c1 += 4; |
176 » » » c2 += 4; | 173 c2 += 4; |
177 | 174 |
178 » » » vy = _mm_load_ps(c0); | 175 vy = _mm_load_ps(c0); |
179 » » » vu = _mm_load_ps(c1); | 176 vu = _mm_load_ps(c1); |
180 » » » vv = _mm_load_ps(c2); | 177 vv = _mm_load_ps(c2); |
181 » » » vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); | 178 vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); |
182 » » » vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm
_mul_ps(vv, vgv)); | 179 vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)
); |
183 » » » vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); | 180 vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); |
184 » » » _mm_store_ps(c0, vr); | 181 _mm_store_ps(c0, vr); |
185 » » » _mm_store_ps(c1, vg); | 182 _mm_store_ps(c1, vg); |
186 » » » _mm_store_ps(c2, vb); | 183 _mm_store_ps(c2, vb); |
187 » » » c0 += 4; | 184 c0 += 4; |
188 » » » c1 += 4; | 185 c1 += 4; |
189 » » » c2 += 4; | 186 c2 += 4; |
190 » » } | 187 } |
191 » » n &= 7; | 188 n &= 7; |
192 » } else { | 189 |
193 » » for (i = 0; i < n; ++i) { | |
194 » » » OPJ_FLOAT32 y = c0[i]; | |
195 » » » OPJ_FLOAT32 u = c1[i]; | |
196 » » » OPJ_FLOAT32 v = c2[i]; | |
197 » » » OPJ_FLOAT32 r = y + (v * 1.402f); | |
198 » » » OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); | |
199 » » » OPJ_FLOAT32 b = y + (u * 1.772f); | |
200 » » » c0[i] = r; | |
201 » » » c1[i] = g; | |
202 » » » c2[i] = b; | |
203 » » } | |
204 » } | |
205 #endif | 190 #endif |
206 for(i = 0; i < n; ++i) { | 191 for(i = 0; i < n; ++i) { |
207 OPJ_FLOAT32 y = c0[i]; | 192 OPJ_FLOAT32 y = c0[i]; |
208 OPJ_FLOAT32 u = c1[i]; | 193 OPJ_FLOAT32 u = c1[i]; |
209 OPJ_FLOAT32 v = c2[i]; | 194 OPJ_FLOAT32 v = c2[i]; |
210 OPJ_FLOAT32 r = y + (v * 1.402f); | 195 OPJ_FLOAT32 r = y + (v * 1.402f); |
211 OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); | 196 OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f)); |
212 OPJ_FLOAT32 b = y + (u * 1.772f); | 197 OPJ_FLOAT32 b = y + (u * 1.772f); |
213 c0[i] = r; | 198 c0[i] = r; |
214 c1[i] = g; | 199 c1[i] = g; |
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
332 lIndex = i; | 317 lIndex = i; |
333 | 318 |
334 for (j=0;j<pNbComps;++j) { | 319 for (j=0;j<pNbComps;++j) { |
335 lCurrentValue = lMatrix[lIndex]; | 320 lCurrentValue = lMatrix[lIndex]; |
336 lIndex += pNbComps; | 321 lIndex += pNbComps; |
337 lNorms[i] += lCurrentValue * lCurrentValue; | 322 lNorms[i] += lCurrentValue * lCurrentValue; |
338 } | 323 } |
339 lNorms[i] = sqrt(lNorms[i]); | 324 lNorms[i] = sqrt(lNorms[i]); |
340 } | 325 } |
341 } | 326 } |
OLD | NEW |