Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(112)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

Issue 958693004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 %include "third_party/x86inc/x86inc.asm" 11 %include "third_party/x86inc/x86inc.asm"
12 12
13 SECTION_RODATA 13 SECTION_RODATA
14 pw_1: times 8 dw 1 14 pw_1: times 8 dw 1
15 15
16 SECTION .text 16 SECTION .text
17 17
18 ; TODO(yunqingwang)fix quantize_b code for skip=1 case.
18 %macro QUANTIZE_FN 2 19 %macro QUANTIZE_FN 2
19 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ 20 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
20 shift, qcoeff, dqcoeff, dequant, \ 21 shift, qcoeff, dqcoeff, dequant, \
21 eob, scan, iscan 22 eob, scan, iscan
22 cmp dword skipm, 0 23 cmp dword skipm, 0
23 jne .blank 24 jne .blank
24 25
25 ; actual quantize loop - setup pointers, rounders, etc. 26 ; actual quantize loop - setup pointers, rounders, etc.
26 movifnidn coeffq, coeffmp 27 movifnidn coeffq, coeffmp
27 movifnidn ncoeffq, ncoeffmp 28 movifnidn ncoeffq, ncoeffmp
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after
237 psrlw m1, 1 ; m1 = (m1 + 1) / 2 238 psrlw m1, 1 ; m1 = (m1 + 1) / 2
238 %endif 239 %endif
239 mova m3, [r2q] ; m3 = dequant 240 mova m3, [r2q] ; m3 = dequant
240 mov r3, qcoeffmp 241 mov r3, qcoeffmp
241 mov r4, dqcoeffmp 242 mov r4, dqcoeffmp
242 mov r5, iscanmp 243 mov r5, iscanmp
243 %ifidn %1, fp_32x32 244 %ifidn %1, fp_32x32
244 psllw m2, 1 245 psllw m2, 1
245 %endif 246 %endif
246 pxor m5, m5 ; m5 = dedicated zero 247 pxor m5, m5 ; m5 = dedicated zero
247 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob 248
248 lea coeffq, [ coeffq+ncoeffq*2] 249 lea coeffq, [ coeffq+ncoeffq*2]
249 lea iscanq, [ iscanq+ncoeffq*2] 250 lea r5q, [ r5q+ncoeffq*2]
250 lea qcoeffq, [ qcoeffq+ncoeffq*2] 251 lea r3q, [ r3q+ncoeffq*2]
251 lea dqcoeffq, [dqcoeffq+ncoeffq*2] 252 lea r4q, [r4q+ncoeffq*2]
252 neg ncoeffq 253 neg ncoeffq
253 254
254 ; get DC and first 15 AC coeffs 255 ; get DC and first 15 AC coeffs
255 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] 256 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
256 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] 257 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
257 pabsw m6, m9 ; m6 = abs(m9) 258 pabsw m6, m9 ; m6 = abs(m9)
258 pabsw m11, m10 ; m11 = abs(m10) 259 pabsw m11, m10 ; m11 = abs(m10)
259 pcmpeqw m7, m7 260 pcmpeqw m7, m7
260 261
261 paddsw m6, m1 ; m6 += round 262 paddsw m6, m1 ; m6 += round
262 punpckhqdq m1, m1 263 punpckhqdq m1, m1
263 paddsw m11, m1 ; m11 += round 264 paddsw m11, m1 ; m11 += round
264 pmulhw m8, m6, m2 ; m8 = m6*q>>16 265 pmulhw m8, m6, m2 ; m8 = m6*q>>16
265 punpckhqdq m2, m2 266 punpckhqdq m2, m2
266 pmulhw m13, m11, m2 ; m13 = m11*q>>16 267 pmulhw m13, m11, m2 ; m13 = m11*q>>16
267 psignw m8, m9 ; m8 = reinsert sign 268 psignw m8, m9 ; m8 = reinsert sign
268 psignw m13, m10 ; m13 = reinsert sign 269 psignw m13, m10 ; m13 = reinsert sign
269 mova [qcoeffq+ncoeffq*2+ 0], m8 270 mova [r3q+ncoeffq*2+ 0], m8
270 mova [qcoeffq+ncoeffq*2+16], m13 271 mova [r3q+ncoeffq*2+16], m13
271 %ifidn %1, fp_32x32 272 %ifidn %1, fp_32x32
272 pabsw m8, m8 273 pabsw m8, m8
273 pabsw m13, m13 274 pabsw m13, m13
274 %endif 275 %endif
275 pmullw m8, m3 ; dqc[i] = qc[i] * q 276 pmullw m8, m3 ; r4[i] = r3[i] * q
276 punpckhqdq m3, m3 277 punpckhqdq m3, m3
277 pmullw m13, m3 ; dqc[i] = qc[i] * q 278 pmullw m13, m3 ; r4[i] = r3[i] * q
278 %ifidn %1, fp_32x32 279 %ifidn %1, fp_32x32
279 psrlw m8, 1 280 psrlw m8, 1
280 psrlw m13, 1 281 psrlw m13, 1
281 psignw m8, m9 282 psignw m8, m9
282 psignw m13, m10 283 psignw m13, m10
283 psrlw m0, m3, 2 284 psrlw m0, m3, 2
284 %endif 285 %endif
285 mova [dqcoeffq+ncoeffq*2+ 0], m8 286 mova [r4q+ncoeffq*2+ 0], m8
286 mova [dqcoeffq+ncoeffq*2+16], m13 287 mova [r4q+ncoeffq*2+16], m13
287 pcmpeqw m8, m5 ; m8 = c[i] == 0 288 pcmpeqw m8, m5 ; m8 = c[i] == 0
288 pcmpeqw m13, m5 ; m13 = c[i] == 0 289 pcmpeqw m13, m5 ; m13 = c[i] == 0
289 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] 290 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
290 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] 291 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
291 psubw m6, m7 ; m6 = scan[i] + 1 292 psubw m6, m7 ; m6 = scan[i] + 1
292 psubw m11, m7 ; m11 = scan[i] + 1 293 psubw m11, m7 ; m11 = scan[i] + 1
293 pandn m8, m6 ; m8 = max(eob) 294 pandn m8, m6 ; m8 = max(eob)
294 pandn m13, m11 ; m13 = max(eob) 295 pandn m13, m11 ; m13 = max(eob)
295 pmaxsw m8, m13 296 pmaxsw m8, m13
296 add ncoeffq, mmsize 297 add ncoeffq, mmsize
297 jz .accumulate_eob 298 jz .accumulate_eob
298 299
299 .ac_only_loop: 300 .ac_only_loop:
300 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] 301 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
(...skipping 10 matching lines...) Expand all
311 jz .skip_iter 312 jz .skip_iter
312 %endif 313 %endif
313 pcmpeqw m7, m7 314 pcmpeqw m7, m7
314 315
315 paddsw m6, m1 ; m6 += round 316 paddsw m6, m1 ; m6 += round
316 paddsw m11, m1 ; m11 += round 317 paddsw m11, m1 ; m11 += round
317 pmulhw m14, m6, m2 ; m14 = m6*q>>16 318 pmulhw m14, m6, m2 ; m14 = m6*q>>16
318 pmulhw m13, m11, m2 ; m13 = m11*q>>16 319 pmulhw m13, m11, m2 ; m13 = m11*q>>16
319 psignw m14, m9 ; m14 = reinsert sign 320 psignw m14, m9 ; m14 = reinsert sign
320 psignw m13, m10 ; m13 = reinsert sign 321 psignw m13, m10 ; m13 = reinsert sign
321 mova [qcoeffq+ncoeffq*2+ 0], m14 322 mova [r3q+ncoeffq*2+ 0], m14
322 mova [qcoeffq+ncoeffq*2+16], m13 323 mova [r3q+ncoeffq*2+16], m13
323 %ifidn %1, fp_32x32 324 %ifidn %1, fp_32x32
324 pabsw m14, m14 325 pabsw m14, m14
325 pabsw m13, m13 326 pabsw m13, m13
326 %endif 327 %endif
327 pmullw m14, m3 ; dqc[i] = qc[i] * q 328 pmullw m14, m3 ; r4[i] = r3[i] * q
328 pmullw m13, m3 ; dqc[i] = qc[i] * q 329 pmullw m13, m3 ; r4[i] = r3[i] * q
329 %ifidn %1, fp_32x32 330 %ifidn %1, fp_32x32
330 psrlw m14, 1 331 psrlw m14, 1
331 psrlw m13, 1 332 psrlw m13, 1
332 psignw m14, m9 333 psignw m14, m9
333 psignw m13, m10 334 psignw m13, m10
334 %endif 335 %endif
335 mova [dqcoeffq+ncoeffq*2+ 0], m14 336 mova [r4q+ncoeffq*2+ 0], m14
336 mova [dqcoeffq+ncoeffq*2+16], m13 337 mova [r4q+ncoeffq*2+16], m13
337 pcmpeqw m14, m5 ; m14 = c[i] == 0 338 pcmpeqw m14, m5 ; m14 = c[i] == 0
338 pcmpeqw m13, m5 ; m13 = c[i] == 0 339 pcmpeqw m13, m5 ; m13 = c[i] == 0
339 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] 340 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
340 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] 341 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
341 psubw m6, m7 ; m6 = scan[i] + 1 342 psubw m6, m7 ; m6 = scan[i] + 1
342 psubw m11, m7 ; m11 = scan[i] + 1 343 psubw m11, m7 ; m11 = scan[i] + 1
343 pandn m14, m6 ; m14 = max(eob) 344 pandn m14, m6 ; m14 = max(eob)
344 pandn m13, m11 ; m13 = max(eob) 345 pandn m13, m11 ; m13 = max(eob)
345 pmaxsw m8, m14 346 pmaxsw m8, m14
346 pmaxsw m8, m13 347 pmaxsw m8, m13
347 add ncoeffq, mmsize 348 add ncoeffq, mmsize
348 jl .ac_only_loop 349 jl .ac_only_loop
349 350
350 %ifidn %1, fp_32x32 351 %ifidn %1, fp_32x32
351 jmp .accumulate_eob 352 jmp .accumulate_eob
352 .skip_iter: 353 .skip_iter:
353 mova [qcoeffq+ncoeffq*2+ 0], m5 354 mova [r3q+ncoeffq*2+ 0], m5
354 mova [qcoeffq+ncoeffq*2+16], m5 355 mova [r3q+ncoeffq*2+16], m5
355 mova [dqcoeffq+ncoeffq*2+ 0], m5 356 mova [r4q+ncoeffq*2+ 0], m5
356 mova [dqcoeffq+ncoeffq*2+16], m5 357 mova [r4q+ncoeffq*2+16], m5
357 add ncoeffq, mmsize 358 add ncoeffq, mmsize
358 jl .ac_only_loop 359 jl .ac_only_loop
359 %endif 360 %endif
360 361
361 .accumulate_eob: 362 .accumulate_eob:
362 ; horizontally accumulate/max eobs and write into [eob] memory pointer 363 ; horizontally accumulate/max eobs and write into [eob] memory pointer
363 mov r2, eobmp 364 mov r2, eobmp
364 pshufd m7, m8, 0xe 365 pshufd m7, m8, 0xe
365 pmaxsw m8, m7 366 pmaxsw m8, m7
366 pshuflw m7, m8, 0xe 367 pshuflw m7, m8, 0xe
367 pmaxsw m8, m7 368 pmaxsw m8, m7
368 pshuflw m7, m8, 0x1 369 pshuflw m7, m8, 0x1
369 pmaxsw m8, m7 370 pmaxsw m8, m7
370 pextrw r6, m8, 0 371 pextrw r6, m8, 0
371 mov [r2], r6 372 mov [r2], r6
372 RET 373 RET
373 374
374 ; skip-block, i.e. just write all zeroes 375 ; skip-block, i.e. just write all zeroes
375 .blank: 376 .blank:
376 mov r0, dqcoeffmp 377 mov r0, dqcoeffmp
377 movifnidn ncoeffq, ncoeffmp 378 movifnidn ncoeffq, ncoeffmp
378 mov r2, qcoeffmp 379 mov r2, qcoeffmp
379 mov r3, eobmp 380 mov r3, eobmp
380 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob 381
381 lea dqcoeffq, [dqcoeffq+ncoeffq*2] 382 lea r0q, [r0q+ncoeffq*2]
382 lea qcoeffq, [ qcoeffq+ncoeffq*2] 383 lea r2q, [r2q+ncoeffq*2]
383 neg ncoeffq 384 neg ncoeffq
384 pxor m7, m7 385 pxor m7, m7
385 .blank_loop: 386 .blank_loop:
386 mova [dqcoeffq+ncoeffq*2+ 0], m7 387 mova [r0q+ncoeffq*2+ 0], m7
387 mova [dqcoeffq+ncoeffq*2+16], m7 388 mova [r0q+ncoeffq*2+16], m7
388 mova [qcoeffq+ncoeffq*2+ 0], m7 389 mova [r2q+ncoeffq*2+ 0], m7
389 mova [qcoeffq+ncoeffq*2+16], m7 390 mova [r2q+ncoeffq*2+16], m7
390 add ncoeffq, mmsize 391 add ncoeffq, mmsize
391 jl .blank_loop 392 jl .blank_loop
392 mov word [eobq], 0 393 mov word [r3q], 0
393 RET 394 RET
394 %endmacro 395 %endmacro
395 396
396 INIT_XMM ssse3 397 INIT_XMM ssse3
397 QUANTIZE_FP fp, 7 398 QUANTIZE_FP fp, 7
398 QUANTIZE_FP fp_32x32, 7 399 QUANTIZE_FP fp_32x32, 7
OLD | NEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698