OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 199 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
210 mova [qcoeffq+ncoeffq*2+16], m7 | 210 mova [qcoeffq+ncoeffq*2+16], m7 |
211 add ncoeffq, mmsize | 211 add ncoeffq, mmsize |
212 jl .blank_loop | 212 jl .blank_loop |
213 mov word [eobq], 0 | 213 mov word [eobq], 0 |
214 RET | 214 RET |
215 %endmacro | 215 %endmacro |
216 | 216 |
217 INIT_XMM ssse3 | 217 INIT_XMM ssse3 |
218 QUANTIZE_FN b, 7 | 218 QUANTIZE_FN b, 7 |
219 QUANTIZE_FN b_32x32, 7 | 219 QUANTIZE_FN b_32x32, 7 |
| 220 |
;-----------------------------------------------------------------------------
; QUANTIZE_FP name_suffix, num_gprs
;
; Emits the function quantize_<name_suffix> through x86inc's cglobal.
; The body uses m8-m14, so it can only be assembled for x86-64.
;
; Arguments, in cglobal order:
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
;   dequant, zbin_oq, eob, scan, iscan
; zbin is fetched but never read; shift, zbin_oq and scan are not referenced.
;
; skip != 0:
;   qcoeff[0..ncoeff) and dqcoeff[0..ncoeff) are zero-filled and *eob = 0.
; skip == 0, for every 16-bit coefficient c[i]:
;   qc[i]  = sign(c[i]) * (((|c[i]| + round) * quant) >> 16)
;            (the add saturates, the multiply keeps the signed high word)
;   dqc[i] = low 16 bits of qc[i] * dequant
;   *eob   = max of (iscan[i] + 1) over all i with dqc[i] != 0, else 0
;
; round/quant/dequant are each read as one 8-word vector. Coefficients 0..7
; use the vector as loaded; every later coefficient uses its upper four words
; replicated into both halves (punpckhqdq).
;
; Each iteration handles 16 coefficients (two XMM vectors), so ncoeff is
; presumably a positive multiple of 16 -- TODO confirm against callers.
; All buffers are accessed with mova; assumes 16-byte alignment -- TODO confirm.
;
; *eob is written as a 16-bit word on both exit paths.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp
  movifnidn   zbinq, zbinmp               ; fetched but not used below
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m1, [roundq]                ; m1 = round
  mova        m2, [quantq]                ; m2 = quant
%ifidn %1, b_32x32
  ; TODO(jingning) to be continued with 32x32 quantization process
  ; NOTE(review): m0 is never loaded in this macro, so the two m0 updates
  ; below operate on an undefined register.
  pcmpeqw     m5, m5
  psrlw       m5, 15                      ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                       ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                       ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                   ; m3 = dequant
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  ; NOTE(review): m4 is never loaded in this macro either.
  psllw       m4, 1
%endif
  pxor        m5, m5                      ; m5 = dedicated zero
  ; Rename the registers filled above. The d1..d6 placeholders keep eob at
  ; argument index 11, the same position it has in the cglobal list, so
  ; eobmp still refers to the caller's eob argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every array at its end and count ncoeff up from -ncoeff to 0, so a
  ; single add both advances the index and sets the loop-exit flags.
  lea         coeffq,   [  coeffq+ncoeffq*2]
  lea         iscanq,   [  iscanq+ncoeffq*2]
  lea         qcoeffq,  [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  mova        m9,  [coeffq+ncoeffq*2+ 0]  ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i]
  pabsw       m6,  m9                     ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpeqw     m7,  m7                     ; m7  = -1 in every word

  paddsw      m6,  m1                     ; m6 += round
  punpckhqdq  m1,  m1                     ; replicate the upper four words
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m8,  m6, m2                 ; m8 = m6*q>>16
  punpckhqdq  m2,  m2                     ; replicate the upper four words
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  psignw      m8,  m9                     ; m8 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m8,  m8
  pabsw       m13, m13
%endif
  pmullw      m8,  m3                     ; dqc[i] = qc[i] * dequant
  punpckhqdq  m3,  m3                     ; replicate the upper four words
  pmullw      m13, m3                     ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw       m8,  1
  psrlw       m13, 1
  psignw      m8,  m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m8,  m5                     ; m8  = (dqc[i] == 0)
  pcmpeqw     m13, m5                     ; m13 = (dqc[i] == 0)
  mova        m6,  [iscanq+ncoeffq*2+ 0]  ; m6  = iscan[i]
  mova        m11, [iscanq+ncoeffq*2+16]  ; m11 = iscan[i]
  psubw       m6,  m7                     ; m6  = iscan[i] + 1
  psubw       m11, m7                     ; m11 = iscan[i] + 1
  pandn       m8,  m6                     ; keep iscan[i]+1 where dqc[i] != 0
  pandn       m13, m11                    ; keep iscan[i]+1 where dqc[i] != 0
  pmaxsw      m8,  m13                    ; m8 = running per-lane max(eob)
  add         ncoeffq, mmsize
  jz          .accumulate_eob

.ac_only_loop:
  mova        m9,  [coeffq+ncoeffq*2+ 0]  ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i]
  pabsw       m6,  m9                     ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpeqw     m7,  m7                     ; m7  = -1 in every word
%ifidn %1, b_32x32
  ; NOTE(review): m7 was set to all ones just above, so both masks are
  ; non-zero and the jz below can never branch to .skip_iter.
  pmovmskb    r6,  m7
  pmovmskb    r2,  m7
  or          r6,  r2
  jz          .skip_iter
%endif
  paddsw      m6,  m1                     ; m6 += round
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m14, m6, m2                 ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  psignw      m14, m9                     ; m14 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                     ; dqc[i] = qc[i] * dequant
  pmullw      m13, m3                     ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                     ; m14 = (dqc[i] == 0)
  pcmpeqw     m13, m5                     ; m13 = (dqc[i] == 0)
  mova        m6,  [iscanq+ncoeffq*2+ 0]  ; m6  = iscan[i]
  mova        m11, [iscanq+ncoeffq*2+16]  ; m11 = iscan[i]
  psubw       m6,  m7                     ; m6  = iscan[i] + 1
  psubw       m11, m7                     ; m11 = iscan[i] + 1
  pandn       m14, m6                     ; keep iscan[i]+1 where dqc[i] != 0
  pandn       m13, m11                    ; keep iscan[i]+1 where dqc[i] != 0
  pmaxsw      m8,  m14
  pmaxsw      m8,  m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, b_32x32
  jmp         .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2,  eobmp
  pshufd      m7,  m8, 0xe                ; upper qword -> lower qword
  pmaxsw      m8,  m7
  pshuflw     m7,  m8, 0xe                ; words 2,3 -> words 0,1
  pmaxsw      m8,  m7
  pshuflw     m7,  m8, 0x1                ; word 1 -> word 0
  pmaxsw      m8,  m7                     ; word 0 = max over all eight lanes
  pextrw      r6,  m8, 0
  ; Store only 16 bits, matching the "mov word [eobq], 0" in .blank. A
  ; register-wide store would overwrite the bytes that follow *eob.
  mov         [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0,  dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2,  qcoeffmp
  mov         r3,  eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq,  [ qcoeffq+ncoeffq*2]
  neg         ncoeffq
  pxor        m7,  m7
.blank_loop:
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [eobq], 0
  RET
%endmacro

; Only the fp variant is instantiated, so every "%ifidn %1, b_32x32" branch
; above is currently never assembled.
INIT_XMM ssse3
QUANTIZE_FP fp, 7
OLD | NEW |