| OLD | NEW |
| 1 /* Copyright (c) 2007-2008 CSIRO | 1 /* Copyright (c) 2007-2008 CSIRO |
| 2 Copyright (c) 2007-2009 Xiph.Org Foundation | 2 Copyright (c) 2007-2009 Xiph.Org Foundation |
| 3 Written by Jean-Marc Valin */ | 3 Written by Jean-Marc Valin */ |
| 4 /* | 4 /* |
| 5 Redistribution and use in source and binary forms, with or without | 5 Redistribution and use in source and binary forms, with or without |
| 6 modification, are permitted provided that the following conditions | 6 modification, are permitted provided that the following conditions |
| 7 are met: | 7 are met: |
| 8 | 8 |
| 9 - Redistributions of source code must retain the above copyright | 9 - Redistributions of source code must retain the above copyright |
| 10 notice, this list of conditions and the following disclaimer. | 10 notice, this list of conditions and the following disclaimer. |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 60 { | 60 { |
| 61 celt_norm x1, x2; | 61 celt_norm x1, x2; |
| 62 x1 = Xptr[0]; | 62 x1 = Xptr[0]; |
| 63 x2 = Xptr[stride]; | 63 x2 = Xptr[stride]; |
| 64 Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); | 64 Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); |
| 65 *Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); | 65 *Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); |
| 66 } | 66 } |
| 67 } | 67 } |
| 68 #endif /* OVERRIDE_vq_exp_rotation1 */ | 68 #endif /* OVERRIDE_vq_exp_rotation1 */ |
| 69 | 69 |
| 70 static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int
spread) | 70 void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread) |
| 71 { | 71 { |
| 72 static const int SPREAD_FACTOR[3]={15,10,5}; | 72 static const int SPREAD_FACTOR[3]={15,10,5}; |
| 73 int i; | 73 int i; |
| 74 opus_val16 c, s; | 74 opus_val16 c, s; |
| 75 opus_val16 gain, theta; | 75 opus_val16 gain, theta; |
| 76 int stride2=0; | 76 int stride2=0; |
| 77 int factor; | 77 int factor; |
| 78 | 78 |
| 79 if (2*K>=len || spread==SPREAD_NONE) | 79 if (2*K>=len || spread==SPREAD_NONE) |
| 80 return; | 80 return; |
| (...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 151 int j; | 151 int j; |
| 152 unsigned tmp=0; | 152 unsigned tmp=0; |
| 153 j=0; do { | 153 j=0; do { |
| 154 tmp |= iy[i*N0+j]; | 154 tmp |= iy[i*N0+j]; |
| 155 } while (++j<N0); | 155 } while (++j<N0); |
| 156 collapse_mask |= (tmp!=0)<<i; | 156 collapse_mask |= (tmp!=0)<<i; |
| 157 } while (++i<B); | 157 } while (++i<B); |
| 158 return collapse_mask; | 158 return collapse_mask; |
| 159 } | 159 } |
| 160 | 160 |
| 161 unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc | 161 opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch) |
| 162 #ifdef RESYNTH | |
| 163 , opus_val16 gain | |
| 164 #endif | |
| 165 ) | |
| 166 { | 162 { |
| 167 VARDECL(celt_norm, y); | 163 VARDECL(celt_norm, y); |
| 168 VARDECL(int, iy); | 164 VARDECL(int, signx); |
| 169 VARDECL(opus_val16, signx); | |
| 170 int i, j; | 165 int i, j; |
| 171 opus_val16 s; | |
| 172 int pulsesLeft; | 166 int pulsesLeft; |
| 173 opus_val32 sum; | 167 opus_val32 sum; |
| 174 opus_val32 xy; | 168 opus_val32 xy; |
| 175 opus_val16 yy; | 169 opus_val16 yy; |
| 176 unsigned collapse_mask; | |
| 177 SAVE_STACK; | 170 SAVE_STACK; |
| 178 | 171 |
| 179 celt_assert2(K>0, "alg_quant() needs at least one pulse"); | 172 (void)arch; |
| 180 celt_assert2(N>1, "alg_quant() needs at least two dimensions"); | |
| 181 | |
| 182 ALLOC(y, N, celt_norm); | 173 ALLOC(y, N, celt_norm); |
| 183 ALLOC(iy, N, int); | 174 ALLOC(signx, N, int); |
| 184 ALLOC(signx, N, opus_val16); | |
| 185 | |
| 186 exp_rotation(X, N, 1, B, K, spread); | |
| 187 | 175 |
| 188 /* Get rid of the sign */ | 176 /* Get rid of the sign */ |
| 189 sum = 0; | 177 sum = 0; |
| 190 j=0; do { | 178 j=0; do { |
| 191 if (X[j]>0) | 179 signx[j] = X[j]<0; |
| 192 signx[j]=1; | 180 /* OPT: Make sure the compiler doesn't use a branch on ABS16(). */ |
| 193 else { | 181 X[j] = ABS16(X[j]); |
| 194 signx[j]=-1; | |
| 195 X[j]=-X[j]; | |
| 196 } | |
| 197 iy[j] = 0; | 182 iy[j] = 0; |
| 198 y[j] = 0; | 183 y[j] = 0; |
| 199 } while (++j<N); | 184 } while (++j<N); |
| 200 | 185 |
| 201 xy = yy = 0; | 186 xy = yy = 0; |
| 202 | 187 |
| 203 pulsesLeft = K; | 188 pulsesLeft = K; |
| 204 | 189 |
| 205 /* Do a pre-search by projecting on the pyramid */ | 190 /* Do a pre-search by projecting on the pyramid */ |
| 206 if (K > (N>>1)) | 191 if (K > (N>>1)) |
| (...skipping 11 matching lines...) Expand all Loading... |
| 218 to be allocated. 64 is an approximation of infinity here. */ | 203 to be allocated. 64 is an approximation of infinity here. */ |
| 219 if (!(sum > EPSILON && sum < 64)) | 204 if (!(sum > EPSILON && sum < 64)) |
| 220 #endif | 205 #endif |
| 221 { | 206 { |
| 222 X[0] = QCONST16(1.f,14); | 207 X[0] = QCONST16(1.f,14); |
| 223 j=1; do | 208 j=1; do |
| 224 X[j]=0; | 209 X[j]=0; |
| 225 while (++j<N); | 210 while (++j<N); |
| 226 sum = QCONST16(1.f,14); | 211 sum = QCONST16(1.f,14); |
| 227 } | 212 } |
| 228 rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum))); | 213 #ifdef FIXED_POINT |
| 214 rcp = EXTRACT16(MULT16_32_Q16(K, celt_rcp(sum))); |
| 215 #else |
| 216 /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */ |
| 217 rcp = EXTRACT16(MULT16_32_Q16(K+0.8f, celt_rcp(sum))); |
| 218 #endif |
| 229 j=0; do { | 219 j=0; do { |
| 230 #ifdef FIXED_POINT | 220 #ifdef FIXED_POINT |
| 231 /* It's really important to round *towards zero* here */ | 221 /* It's really important to round *towards zero* here */ |
| 232 iy[j] = MULT16_16_Q15(X[j],rcp); | 222 iy[j] = MULT16_16_Q15(X[j],rcp); |
| 233 #else | 223 #else |
| 234 iy[j] = (int)floor(rcp*X[j]); | 224 iy[j] = (int)floor(rcp*X[j]); |
| 235 #endif | 225 #endif |
| 236 y[j] = (celt_norm)iy[j]; | 226 y[j] = (celt_norm)iy[j]; |
| 237 yy = MAC16_16(yy, y[j],y[j]); | 227 yy = MAC16_16(yy, y[j],y[j]); |
| 238 xy = MAC16_16(xy, X[j],y[j]); | 228 xy = MAC16_16(xy, X[j],y[j]); |
| 239 y[j] *= 2; | 229 y[j] *= 2; |
| 240 pulsesLeft -= iy[j]; | 230 pulsesLeft -= iy[j]; |
| 241 } while (++j<N); | 231 } while (++j<N); |
| 242 } | 232 } |
| 243 celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass"); | 233 celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass"); |
| 244 | 234 |
| 245 /* This should never happen, but just in case it does (e.g. on silence) | 235 /* This should never happen, but just in case it does (e.g. on silence) |
| 246 we fill the first bin with pulses. */ | 236 we fill the first bin with pulses. */ |
| 247 #ifdef FIXED_POINT_DEBUG | 237 #ifdef FIXED_POINT_DEBUG |
| 248 celt_assert2(pulsesLeft<=N+3, "Not enough pulses in the quick pass"); | 238 celt_assert2(pulsesLeft<=N+3, "Not enough pulses in the quick pass"); |
| 249 #endif | 239 #endif |
| 250 if (pulsesLeft > N+3) | 240 if (pulsesLeft > N+3) |
| 251 { | 241 { |
| 252 opus_val16 tmp = (opus_val16)pulsesLeft; | 242 opus_val16 tmp = (opus_val16)pulsesLeft; |
| 253 yy = MAC16_16(yy, tmp, tmp); | 243 yy = MAC16_16(yy, tmp, tmp); |
| 254 yy = MAC16_16(yy, tmp, y[0]); | 244 yy = MAC16_16(yy, tmp, y[0]); |
| 255 iy[0] += pulsesLeft; | 245 iy[0] += pulsesLeft; |
| 256 pulsesLeft=0; | 246 pulsesLeft=0; |
| 257 } | 247 } |
| 258 | 248 |
| 259 s = 1; | |
| 260 for (i=0;i<pulsesLeft;i++) | 249 for (i=0;i<pulsesLeft;i++) |
| 261 { | 250 { |
| 251 opus_val16 Rxy, Ryy; |
| 262 int best_id; | 252 int best_id; |
| 263 opus_val32 best_num = -VERY_LARGE16; | 253 opus_val32 best_num; |
| 264 opus_val16 best_den = 0; | 254 opus_val16 best_den; |
| 265 #ifdef FIXED_POINT | 255 #ifdef FIXED_POINT |
| 266 int rshift; | 256 int rshift; |
| 267 #endif | 257 #endif |
| 268 #ifdef FIXED_POINT | 258 #ifdef FIXED_POINT |
| 269 rshift = 1+celt_ilog2(K-pulsesLeft+i+1); | 259 rshift = 1+celt_ilog2(K-pulsesLeft+i+1); |
| 270 #endif | 260 #endif |
| 271 best_id = 0; | 261 best_id = 0; |
| 272 /* The squared magnitude term gets added anyway, so we might as well | 262 /* The squared magnitude term gets added anyway, so we might as well |
| 273 add it outside the loop */ | 263 add it outside the loop */ |
| 274 yy = ADD16(yy, 1); | 264 yy = ADD16(yy, 1); |
| 275 j=0; | 265 |
| 266 /* Calculations for position 0 are out of the loop, in part to reduce |
| 267 mispredicted branches (since the if condition is usually false) |
| 268 in the loop. */ |
| 269 /* Temporary sums of the new pulse(s) */ |
| 270 Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[0])),rshift)); |
| 271 /* We're multiplying y[j] by two so we don't have to do it here */ |
| 272 Ryy = ADD16(yy, y[0]); |
| 273 |
| 274 /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that |
| 275 Rxy is positive because the sign is pre-computed) */ |
| 276 Rxy = MULT16_16_Q15(Rxy,Rxy); |
| 277 best_den = Ryy; |
| 278 best_num = Rxy; |
| 279 j=1; |
| 276 do { | 280 do { |
| 277 opus_val16 Rxy, Ryy; | |
| 278 /* Temporary sums of the new pulse(s) */ | 281 /* Temporary sums of the new pulse(s) */ |
| 279 Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift)); | 282 Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift)); |
| 280 /* We're multiplying y[j] by two so we don't have to do it here */ | 283 /* We're multiplying y[j] by two so we don't have to do it here */ |
| 281 Ryy = ADD16(yy, y[j]); | 284 Ryy = ADD16(yy, y[j]); |
| 282 | 285 |
| 283 /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that | 286 /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that |
| 284 Rxy is positive because the sign is pre-computed) */ | 287 Rxy is positive because the sign is pre-computed) */ |
| 285 Rxy = MULT16_16_Q15(Rxy,Rxy); | 288 Rxy = MULT16_16_Q15(Rxy,Rxy); |
| 286 /* The idea is to check for num/den >= best_num/best_den, but that way | 289 /* The idea is to check for num/den >= best_num/best_den, but that way |
| 287 we can do it without any division */ | 290 we can do it without any division */ |
| 288 /* OPT: Make sure to use conditional moves here */ | 291 /* OPT: It's not clear whether a cmov is faster than a branch here |
| 289 if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)) | 292 since the condition is more often false than true and using |
| 293 a cmov introduces data dependencies across iterations. The optimal |
| 294 choice may be architecture-dependent. */ |
| 295 if (opus_unlikely(MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))) |
| 290 { | 296 { |
| 291 best_den = Ryy; | 297 best_den = Ryy; |
| 292 best_num = Rxy; | 298 best_num = Rxy; |
| 293 best_id = j; | 299 best_id = j; |
| 294 } | 300 } |
| 295 } while (++j<N); | 301 } while (++j<N); |
| 296 | 302 |
| 297 /* Updating the sums of the new pulse(s) */ | 303 /* Updating the sums of the new pulse(s) */ |
| 298 xy = ADD32(xy, EXTEND32(X[best_id])); | 304 xy = ADD32(xy, EXTEND32(X[best_id])); |
| 299 /* We're multiplying y[j] by two so we don't have to do it here */ | 305 /* We're multiplying y[j] by two so we don't have to do it here */ |
| 300 yy = ADD16(yy, y[best_id]); | 306 yy = ADD16(yy, y[best_id]); |
| 301 | 307 |
| 302 /* Only now that we've made the final choice, update y/iy */ | 308 /* Only now that we've made the final choice, update y/iy */ |
| 303 /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ | 309 /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ |
| 304 y[best_id] += 2*s; | 310 y[best_id] += 2; |
| 305 iy[best_id]++; | 311 iy[best_id]++; |
| 306 } | 312 } |
| 307 | 313 |
| 308 /* Put the original sign back */ | 314 /* Put the original sign back */ |
| 309 j=0; | 315 j=0; |
| 310 do { | 316 do { |
| 311 X[j] = MULT16_16(signx[j],X[j]); | 317 /*iy[j] = signx[j] ? -iy[j] : iy[j];*/ |
| 312 if (signx[j] < 0) | 318 /* OPT: The is more likely to be compiled without a branch than the code a
bove |
| 313 iy[j] = -iy[j]; | 319 but has the same performance otherwise. */ |
| 320 iy[j] = (iy[j]^-signx[j]) + signx[j]; |
| 314 } while (++j<N); | 321 } while (++j<N); |
| 322 RESTORE_STACK; |
| 323 return yy; |
| 324 } |
| 325 |
| 326 unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, |
| 327 opus_val16 gain, int resynth, int arch) |
| 328 { |
| 329 VARDECL(int, iy); |
| 330 opus_val16 yy; |
| 331 unsigned collapse_mask; |
| 332 SAVE_STACK; |
| 333 |
| 334 celt_assert2(K>0, "alg_quant() needs at least one pulse"); |
| 335 celt_assert2(N>1, "alg_quant() needs at least two dimensions"); |
| 336 |
| 337 /* Covers vectorization by up to 4. */ |
| 338 ALLOC(iy, N+3, int); |
| 339 |
| 340 exp_rotation(X, N, 1, B, K, spread); |
| 341 |
| 342 yy = op_pvq_search(X, iy, K, N, arch); |
| 343 |
| 315 encode_pulses(iy, N, K, enc); | 344 encode_pulses(iy, N, K, enc); |
| 316 | 345 |
| 317 #ifdef RESYNTH | 346 if (resynth) |
| 318 normalise_residual(iy, X, N, yy, gain); | 347 { |
| 319 exp_rotation(X, N, -1, B, K, spread); | 348 normalise_residual(iy, X, N, yy, gain); |
| 320 #endif | 349 exp_rotation(X, N, -1, B, K, spread); |
| 350 } |
| 321 | 351 |
| 322 collapse_mask = extract_collapse_mask(iy, N, B); | 352 collapse_mask = extract_collapse_mask(iy, N, B); |
| 323 RESTORE_STACK; | 353 RESTORE_STACK; |
| 324 return collapse_mask; | 354 return collapse_mask; |
| 325 } | 355 } |
| 326 | 356 |
| 327 /** Decode pulse vector and combine the result with the pitch vector to produce | 357 /** Decode pulse vector and combine the result with the pitch vector to produce |
| 328 the final normalised signal in the current band. */ | 358 the final normalised signal in the current band. */ |
| 329 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B, | 359 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B, |
| 330 ec_dec *dec, opus_val16 gain) | 360 ec_dec *dec, opus_val16 gain) |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 394 } else { | 424 } else { |
| 395 Emid += celt_inner_prod(X, X, N, arch); | 425 Emid += celt_inner_prod(X, X, N, arch); |
| 396 Eside += celt_inner_prod(Y, Y, N, arch); | 426 Eside += celt_inner_prod(Y, Y, N, arch); |
| 397 } | 427 } |
| 398 mid = celt_sqrt(Emid); | 428 mid = celt_sqrt(Emid); |
| 399 side = celt_sqrt(Eside); | 429 side = celt_sqrt(Eside); |
| 400 #ifdef FIXED_POINT | 430 #ifdef FIXED_POINT |
| 401 /* 0.63662 = 2/pi */ | 431 /* 0.63662 = 2/pi */ |
| 402 itheta = MULT16_16_Q15(QCONST16(0.63662f,15),celt_atan2p(side, mid)); | 432 itheta = MULT16_16_Q15(QCONST16(0.63662f,15),celt_atan2p(side, mid)); |
| 403 #else | 433 #else |
| 404 itheta = (int)floor(.5f+16384*0.63662f*atan2(side,mid)); | 434 itheta = (int)floor(.5f+16384*0.63662f*fast_atan2f(side,mid)); |
| 405 #endif | 435 #endif |
| 406 | 436 |
| 407 return itheta; | 437 return itheta; |
| 408 } | 438 } |
| OLD | NEW |