OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 216 matching lines...)
227 | 227 |
228 ; actual quantize loop - setup pointers, rounders, etc. | 228 ; actual quantize loop - setup pointers, rounders, etc. |
229 movifnidn coeffq, coeffmp | 229 movifnidn coeffq, coeffmp |
230 movifnidn ncoeffq, ncoeffmp | 230 movifnidn ncoeffq, ncoeffmp |
231 mov r2, dequantmp | 231 mov r2, dequantmp |
232 movifnidn zbinq, zbinmp | 232 movifnidn zbinq, zbinmp |
233 movifnidn roundq, roundmp | 233 movifnidn roundq, roundmp |
234 movifnidn quantq, quantmp | 234 movifnidn quantq, quantmp |
235 mova m1, [roundq] ; m1 = round | 235 mova m1, [roundq] ; m1 = round |
236 mova m2, [quantq] ; m2 = quant | 236 mova m2, [quantq] ; m2 = quant |
237 %ifidn %1, b_32x32 | 237 %ifidn %1, fp_32x32 |
238 ; TODO(jingning) to be continued with 32x32 quantization process | |
239 pcmpeqw m5, m5 | 238 pcmpeqw m5, m5 |
240 psrlw m5, 15 | 239 psrlw m5, 15 |
241 paddw m0, m5 | |
242 paddw m1, m5 | 240 paddw m1, m5 |
243 psrlw m0, 1 ; m0 = (m0 + 1) / 2 | |
244 psrlw m1, 1 ; m1 = (m1 + 1) / 2 | 241 psrlw m1, 1 ; m1 = (m1 + 1) / 2 |
245 %endif | 242 %endif |
246 mova m3, [r2q] ; m3 = dequant | 243 mova m3, [r2q] ; m3 = dequant |
247 mov r3, qcoeffmp | 244 mov r3, qcoeffmp |
248 mov r4, dqcoeffmp | 245 mov r4, dqcoeffmp |
249 mov r5, iscanmp | 246 mov r5, iscanmp |
250 %ifidn %1, b_32x32 | 247 %ifidn %1, fp_32x32 |
251 psllw m4, 1 | 248 psllw m2, 1 |
252 %endif | 249 %endif |
253 pxor m5, m5 ; m5 = dedicated zero | 250 pxor m5, m5 ; m5 = dedicated zero |
254 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob | 251 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob |
255 lea coeffq, [ coeffq+ncoeffq*2] | 252 lea coeffq, [ coeffq+ncoeffq*2] |
256 lea iscanq, [ iscanq+ncoeffq*2] | 253 lea iscanq, [ iscanq+ncoeffq*2] |
257 lea qcoeffq, [ qcoeffq+ncoeffq*2] | 254 lea qcoeffq, [ qcoeffq+ncoeffq*2] |
258 lea dqcoeffq, [dqcoeffq+ncoeffq*2] | 255 lea dqcoeffq, [dqcoeffq+ncoeffq*2] |
259 neg ncoeffq | 256 neg ncoeffq |
260 | 257 |
261 ; get DC and first 15 AC coeffs | 258 ; get DC and first 15 AC coeffs |
262 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 259 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
263 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 260 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
264 pabsw m6, m9 ; m6 = abs(m9) | 261 pabsw m6, m9 ; m6 = abs(m9) |
265 pabsw m11, m10 ; m11 = abs(m10) | 262 pabsw m11, m10 ; m11 = abs(m10) |
266 pcmpeqw m7, m7 | 263 pcmpeqw m7, m7 |
267 | 264 |
268 paddsw m6, m1 ; m6 += round | 265 paddsw m6, m1 ; m6 += round |
269 punpckhqdq m1, m1 | 266 punpckhqdq m1, m1 |
270 paddsw m11, m1 ; m11 += round | 267 paddsw m11, m1 ; m11 += round |
271 pmulhw m8, m6, m2 ; m8 = m6*q>>16 | 268 pmulhw m8, m6, m2 ; m8 = m6*q>>16 |
272 punpckhqdq m2, m2 | 269 punpckhqdq m2, m2 |
273 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 270 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
274 psignw m8, m9 ; m8 = reinsert sign | 271 psignw m8, m9 ; m8 = reinsert sign |
275 psignw m13, m10 ; m13 = reinsert sign | 272 psignw m13, m10 ; m13 = reinsert sign |
276 mova [qcoeffq+ncoeffq*2+ 0], m8 | 273 mova [qcoeffq+ncoeffq*2+ 0], m8 |
277 mova [qcoeffq+ncoeffq*2+16], m13 | 274 mova [qcoeffq+ncoeffq*2+16], m13 |
278 %ifidn %1, b_32x32 | 275 %ifidn %1, fp_32x32 |
279 pabsw m8, m8 | 276 pabsw m8, m8 |
280 pabsw m13, m13 | 277 pabsw m13, m13 |
281 %endif | 278 %endif |
282 pmullw m8, m3 ; dqc[i] = qc[i] * q | 279 pmullw m8, m3 ; dqc[i] = qc[i] * q |
283 punpckhqdq m3, m3 | 280 punpckhqdq m3, m3 |
284 pmullw m13, m3 ; dqc[i] = qc[i] * q | 281 pmullw m13, m3 ; dqc[i] = qc[i] * q |
285 %ifidn %1, b_32x32 | 282 %ifidn %1, fp_32x32 |
286 psrlw m8, 1 | 283 psrlw m8, 1 |
287 psrlw m13, 1 | 284 psrlw m13, 1 |
288 psignw m8, m9 | 285 psignw m8, m9 |
289 psignw m13, m10 | 286 psignw m13, m10 |
| 287 psrlw m0, m3, 2 |
290 %endif | 288 %endif |
291 mova [dqcoeffq+ncoeffq*2+ 0], m8 | 289 mova [dqcoeffq+ncoeffq*2+ 0], m8 |
292 mova [dqcoeffq+ncoeffq*2+16], m13 | 290 mova [dqcoeffq+ncoeffq*2+16], m13 |
293 pcmpeqw m8, m5 ; m8 = c[i] == 0 | 291 pcmpeqw m8, m5 ; m8 = c[i] == 0 |
294 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 292 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
295 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 293 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] |
296 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 294 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] |
297 psubw m6, m7 ; m6 = scan[i] + 1 | 295 psubw m6, m7 ; m6 = scan[i] + 1 |
298 psubw m11, m7 ; m11 = scan[i] + 1 | 296 psubw m11, m7 ; m11 = scan[i] + 1 |
299 pandn m8, m6 ; m8 = max(eob) | 297 pandn m8, m6 ; m8 = max(eob) |
300 pandn m13, m11 ; m13 = max(eob) | 298 pandn m13, m11 ; m13 = max(eob) |
301 pmaxsw m8, m13 | 299 pmaxsw m8, m13 |
302 add ncoeffq, mmsize | 300 add ncoeffq, mmsize |
303 jz .accumulate_eob | 301 jz .accumulate_eob |
304 | 302 |
305 .ac_only_loop: | 303 .ac_only_loop: |
306 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] | 304 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] |
307 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] | 305 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] |
308 pabsw m6, m9 ; m6 = abs(m9) | 306 pabsw m6, m9 ; m6 = abs(m9) |
309 pabsw m11, m10 ; m11 = abs(m10) | 307 pabsw m11, m10 ; m11 = abs(m10) |
310 pcmpeqw m7, m7 | 308 %ifidn %1, fp_32x32 |
311 %ifidn %1, b_32x32 | 309 pcmpgtw m7, m6, m0 |
| 310 pcmpgtw m12, m11, m0 |
312 pmovmskb r6, m7 | 311 pmovmskb r6, m7 |
313 pmovmskb r2, m7 | 312 pmovmskb r2, m12 |
| 313 |
314 or r6, r2 | 314 or r6, r2 |
315 jz .skip_iter | 315 jz .skip_iter |
316 %endif | 316 %endif |
| 317 pcmpeqw m7, m7 |
| 318 |
317 paddsw m6, m1 ; m6 += round | 319 paddsw m6, m1 ; m6 += round |
318 paddsw m11, m1 ; m11 += round | 320 paddsw m11, m1 ; m11 += round |
319 pmulhw m14, m6, m2 ; m14 = m6*q>>16 | 321 pmulhw m14, m6, m2 ; m14 = m6*q>>16 |
320 pmulhw m13, m11, m2 ; m13 = m11*q>>16 | 322 pmulhw m13, m11, m2 ; m13 = m11*q>>16 |
321 psignw m14, m9 ; m14 = reinsert sign | 323 psignw m14, m9 ; m14 = reinsert sign |
322 psignw m13, m10 ; m13 = reinsert sign | 324 psignw m13, m10 ; m13 = reinsert sign |
323 mova [qcoeffq+ncoeffq*2+ 0], m14 | 325 mova [qcoeffq+ncoeffq*2+ 0], m14 |
324 mova [qcoeffq+ncoeffq*2+16], m13 | 326 mova [qcoeffq+ncoeffq*2+16], m13 |
325 %ifidn %1, b_32x32 | 327 %ifidn %1, fp_32x32 |
326 pabsw m14, m14 | 328 pabsw m14, m14 |
327 pabsw m13, m13 | 329 pabsw m13, m13 |
328 %endif | 330 %endif |
329 pmullw m14, m3 ; dqc[i] = qc[i] * q | 331 pmullw m14, m3 ; dqc[i] = qc[i] * q |
330 pmullw m13, m3 ; dqc[i] = qc[i] * q | 332 pmullw m13, m3 ; dqc[i] = qc[i] * q |
331 %ifidn %1, b_32x32 | 333 %ifidn %1, fp_32x32 |
332 psrlw m14, 1 | 334 psrlw m14, 1 |
333 psrlw m13, 1 | 335 psrlw m13, 1 |
334 psignw m14, m9 | 336 psignw m14, m9 |
335 psignw m13, m10 | 337 psignw m13, m10 |
336 %endif | 338 %endif |
337 mova [dqcoeffq+ncoeffq*2+ 0], m14 | 339 mova [dqcoeffq+ncoeffq*2+ 0], m14 |
338 mova [dqcoeffq+ncoeffq*2+16], m13 | 340 mova [dqcoeffq+ncoeffq*2+16], m13 |
339 pcmpeqw m14, m5 ; m14 = c[i] == 0 | 341 pcmpeqw m14, m5 ; m14 = c[i] == 0 |
340 pcmpeqw m13, m5 ; m13 = c[i] == 0 | 342 pcmpeqw m13, m5 ; m13 = c[i] == 0 |
341 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] | 343 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] |
342 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] | 344 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] |
343 psubw m6, m7 ; m6 = scan[i] + 1 | 345 psubw m6, m7 ; m6 = scan[i] + 1 |
344 psubw m11, m7 ; m11 = scan[i] + 1 | 346 psubw m11, m7 ; m11 = scan[i] + 1 |
345 pandn m14, m6 ; m14 = max(eob) | 347 pandn m14, m6 ; m14 = max(eob) |
346 pandn m13, m11 ; m13 = max(eob) | 348 pandn m13, m11 ; m13 = max(eob) |
347 pmaxsw m8, m14 | 349 pmaxsw m8, m14 |
348 pmaxsw m8, m13 | 350 pmaxsw m8, m13 |
349 add ncoeffq, mmsize | 351 add ncoeffq, mmsize |
350 jl .ac_only_loop | 352 jl .ac_only_loop |
351 | 353 |
352 %ifidn %1, b_32x32 | 354 %ifidn %1, fp_32x32 |
353 jmp .accumulate_eob | 355 jmp .accumulate_eob |
354 .skip_iter: | 356 .skip_iter: |
355 mova [qcoeffq+ncoeffq*2+ 0], m5 | 357 mova [qcoeffq+ncoeffq*2+ 0], m5 |
356 mova [qcoeffq+ncoeffq*2+16], m5 | 358 mova [qcoeffq+ncoeffq*2+16], m5 |
357 mova [dqcoeffq+ncoeffq*2+ 0], m5 | 359 mova [dqcoeffq+ncoeffq*2+ 0], m5 |
358 mova [dqcoeffq+ncoeffq*2+16], m5 | 360 mova [dqcoeffq+ncoeffq*2+16], m5 |
359 add ncoeffq, mmsize | 361 add ncoeffq, mmsize |
360 jl .ac_only_loop | 362 jl .ac_only_loop |
361 %endif | 363 %endif |
362 | 364 |
(...skipping 27 matching lines...)
390 mova [qcoeffq+ncoeffq*2+ 0], m7 | 392 mova [qcoeffq+ncoeffq*2+ 0], m7 |
391 mova [qcoeffq+ncoeffq*2+16], m7 | 393 mova [qcoeffq+ncoeffq*2+16], m7 |
392 add ncoeffq, mmsize | 394 add ncoeffq, mmsize |
393 jl .blank_loop | 395 jl .blank_loop |
394 mov word [eobq], 0 | 396 mov word [eobq], 0 |
395 RET | 397 RET |
396 %endmacro | 398 %endmacro |
397 | 399 |
; Instantiate the quantizer macro for SSSE3. The first argument is the variant
; name that the macro body compares with `%ifidn %1, ...`; the NEW side adds a
; second instantiation, `fp_32x32`, which enables those conditional paths.
; NOTE(review): INIT_XMM is presumably the x86inc register/ISA setup macro, and
; the second argument (7) is defined in the macro header, which is not visible
; in this view - confirm both against the full file.
398 INIT_XMM ssse3 | 400 INIT_XMM ssse3 |
399 QUANTIZE_FP fp, 7 | 401 QUANTIZE_FP fp, 7 |
| 402 QUANTIZE_FP fp_32x32, 7 |
OLD | NEW |