source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm - Issue 958693004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

Issue 958693004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

11 %include "third_party/x86inc/x86inc.asm"	11 %include "third_party/x86inc/x86inc.asm"

12	12

13 SECTION_RODATA	13 SECTION_RODATA

14 pw_1: times 8 dw 1	14 pw_1: times 8 dw 1

15	15

16 SECTION .text	16 SECTION .text

17	17

	18 ; TODO(yunqingwang): fix quantize_b code for skip=1 case.

18 %macro QUANTIZE_FN 2	19 %macro QUANTIZE_FN 2

19 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \	20 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

20 shift, qcoeff, dqcoeff, dequant, \	21 shift, qcoeff, dqcoeff, dequant, \

21 eob, scan, iscan	22 eob, scan, iscan

22 cmp dword skipm, 0	23 cmp dword skipm, 0

23 jne .blank	24 jne .blank

24	25

25 ; actual quantize loop - setup pointers, rounders, etc.	26 ; actual quantize loop - setup pointers, rounders, etc.

26 movifnidn coeffq, coeffmp	27 movifnidn coeffq, coeffmp

27 movifnidn ncoeffq, ncoeffmp	28 movifnidn ncoeffq, ncoeffmp

(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
237 psrlw m1, 1 ; m1 = (m1 + 1) / 2	238 psrlw m1, 1 ; m1 = (m1 + 1) / 2

238 %endif	239 %endif

239 mova m3, [r2q] ; m3 = dequant	240 mova m3, [r2q] ; m3 = dequant

240 mov r3, qcoeffmp	241 mov r3, qcoeffmp

241 mov r4, dqcoeffmp	242 mov r4, dqcoeffmp

242 mov r5, iscanmp	243 mov r5, iscanmp

243 %ifidn %1, fp_32x32	244 %ifidn %1, fp_32x32

244 psllw m2, 1	245 psllw m2, 1

245 %endif	246 %endif

246 pxor m5, m5 ; m5 = dedicated zero	247 pxor m5, m5 ; m5 = dedicated zero

247 DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob	248

248 lea coeffq, [ coeffq+ncoeffq*2]	249 lea coeffq, [ coeffq+ncoeffq*2]

249 lea iscanq, [ iscanq+ncoeffq*2]	250 lea r5q, [ r5q+ncoeffq*2]

250 lea qcoeffq, [ qcoeffq+ncoeffq*2]	251 lea r3q, [ r3q+ncoeffq*2]

251 lea dqcoeffq, [dqcoeffq+ncoeffq*2]	252 lea r4q, [r4q+ncoeffq*2]

252 neg ncoeffq	253 neg ncoeffq

253	254

254 ; get DC and first 15 AC coeffs	255 ; get DC and first 15 AC coeffs

255 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]	256 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]

256 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]	257 mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]

257 pabsw m6, m9 ; m6 = abs(m9)	258 pabsw m6, m9 ; m6 = abs(m9)

258 pabsw m11, m10 ; m11 = abs(m10)	259 pabsw m11, m10 ; m11 = abs(m10)

259 pcmpeqw m7, m7	260 pcmpeqw m7, m7

260	261

261 paddsw m6, m1 ; m6 += round	262 paddsw m6, m1 ; m6 += round

262 punpckhqdq m1, m1	263 punpckhqdq m1, m1

263 paddsw m11, m1 ; m11 += round	264 paddsw m11, m1 ; m11 += round

264 pmulhw m8, m6, m2 ; m8 = m6*q>>16	265 pmulhw m8, m6, m2 ; m8 = m6*q>>16

265 punpckhqdq m2, m2	266 punpckhqdq m2, m2

266 pmulhw m13, m11, m2 ; m13 = m11*q>>16	267 pmulhw m13, m11, m2 ; m13 = m11*q>>16

267 psignw m8, m9 ; m8 = reinsert sign	268 psignw m8, m9 ; m8 = reinsert sign

268 psignw m13, m10 ; m13 = reinsert sign	269 psignw m13, m10 ; m13 = reinsert sign

269 mova [qcoeffq+ncoeffq*2+ 0], m8	270 mova [r3q+ncoeffq*2+ 0], m8

270 mova [qcoeffq+ncoeffq*2+16], m13	271 mova [r3q+ncoeffq*2+16], m13

271 %ifidn %1, fp_32x32	272 %ifidn %1, fp_32x32

272 pabsw m8, m8	273 pabsw m8, m8

273 pabsw m13, m13	274 pabsw m13, m13

274 %endif	275 %endif

275 pmullw m8, m3 ; dqc[i] = qc[i] * q	276 pmullw m8, m3 ; r4[i] = r3[i] * q

276 punpckhqdq m3, m3	277 punpckhqdq m3, m3

277 pmullw m13, m3 ; dqc[i] = qc[i] * q	278 pmullw m13, m3 ; r4[i] = r3[i] * q

278 %ifidn %1, fp_32x32	279 %ifidn %1, fp_32x32

279 psrlw m8, 1	280 psrlw m8, 1

280 psrlw m13, 1	281 psrlw m13, 1

281 psignw m8, m9	282 psignw m8, m9

282 psignw m13, m10	283 psignw m13, m10

283 psrlw m0, m3, 2	284 psrlw m0, m3, 2

284 %endif	285 %endif

285 mova [dqcoeffq+ncoeffq*2+ 0], m8	286 mova [r4q+ncoeffq*2+ 0], m8

286 mova [dqcoeffq+ncoeffq*2+16], m13	287 mova [r4q+ncoeffq*2+16], m13

287 pcmpeqw m8, m5 ; m8 = c[i] == 0	288 pcmpeqw m8, m5 ; m8 = c[i] == 0

288 pcmpeqw m13, m5 ; m13 = c[i] == 0	289 pcmpeqw m13, m5 ; m13 = c[i] == 0

289 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]	290 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]

290 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]	291 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]

291 psubw m6, m7 ; m6 = scan[i] + 1	292 psubw m6, m7 ; m6 = scan[i] + 1

292 psubw m11, m7 ; m11 = scan[i] + 1	293 psubw m11, m7 ; m11 = scan[i] + 1

293 pandn m8, m6 ; m8 = max(eob)	294 pandn m8, m6 ; m8 = max(eob)

294 pandn m13, m11 ; m13 = max(eob)	295 pandn m13, m11 ; m13 = max(eob)

295 pmaxsw m8, m13	296 pmaxsw m8, m13

296 add ncoeffq, mmsize	297 add ncoeffq, mmsize

297 jz .accumulate_eob	298 jz .accumulate_eob

298	299

299 .ac_only_loop:	300 .ac_only_loop:

300 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]	301 mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]

(...skipping 10 matching lines...) Expand all Loading...
311 jz .skip_iter	312 jz .skip_iter

312 %endif	313 %endif

313 pcmpeqw m7, m7	314 pcmpeqw m7, m7

314	315

315 paddsw m6, m1 ; m6 += round	316 paddsw m6, m1 ; m6 += round

316 paddsw m11, m1 ; m11 += round	317 paddsw m11, m1 ; m11 += round

317 pmulhw m14, m6, m2 ; m14 = m6*q>>16	318 pmulhw m14, m6, m2 ; m14 = m6*q>>16

318 pmulhw m13, m11, m2 ; m13 = m11*q>>16	319 pmulhw m13, m11, m2 ; m13 = m11*q>>16

319 psignw m14, m9 ; m14 = reinsert sign	320 psignw m14, m9 ; m14 = reinsert sign

320 psignw m13, m10 ; m13 = reinsert sign	321 psignw m13, m10 ; m13 = reinsert sign

321 mova [qcoeffq+ncoeffq*2+ 0], m14	322 mova [r3q+ncoeffq*2+ 0], m14

322 mova [qcoeffq+ncoeffq*2+16], m13	323 mova [r3q+ncoeffq*2+16], m13

323 %ifidn %1, fp_32x32	324 %ifidn %1, fp_32x32

324 pabsw m14, m14	325 pabsw m14, m14

325 pabsw m13, m13	326 pabsw m13, m13

326 %endif	327 %endif

327 pmullw m14, m3 ; dqc[i] = qc[i] * q	328 pmullw m14, m3 ; r4[i] = r3[i] * q

328 pmullw m13, m3 ; dqc[i] = qc[i] * q	329 pmullw m13, m3 ; r4[i] = r3[i] * q

329 %ifidn %1, fp_32x32	330 %ifidn %1, fp_32x32

330 psrlw m14, 1	331 psrlw m14, 1

331 psrlw m13, 1	332 psrlw m13, 1

332 psignw m14, m9	333 psignw m14, m9

333 psignw m13, m10	334 psignw m13, m10

334 %endif	335 %endif

335 mova [dqcoeffq+ncoeffq*2+ 0], m14	336 mova [r4q+ncoeffq*2+ 0], m14

336 mova [dqcoeffq+ncoeffq*2+16], m13	337 mova [r4q+ncoeffq*2+16], m13

337 pcmpeqw m14, m5 ; m14 = c[i] == 0	338 pcmpeqw m14, m5 ; m14 = c[i] == 0

338 pcmpeqw m13, m5 ; m13 = c[i] == 0	339 pcmpeqw m13, m5 ; m13 = c[i] == 0

339 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]	340 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]

340 mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]	341 mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]

341 psubw m6, m7 ; m6 = scan[i] + 1	342 psubw m6, m7 ; m6 = scan[i] + 1

342 psubw m11, m7 ; m11 = scan[i] + 1	343 psubw m11, m7 ; m11 = scan[i] + 1

343 pandn m14, m6 ; m14 = max(eob)	344 pandn m14, m6 ; m14 = max(eob)

344 pandn m13, m11 ; m13 = max(eob)	345 pandn m13, m11 ; m13 = max(eob)

345 pmaxsw m8, m14	346 pmaxsw m8, m14

346 pmaxsw m8, m13	347 pmaxsw m8, m13

347 add ncoeffq, mmsize	348 add ncoeffq, mmsize

348 jl .ac_only_loop	349 jl .ac_only_loop

349	350

350 %ifidn %1, fp_32x32	351 %ifidn %1, fp_32x32

351 jmp .accumulate_eob	352 jmp .accumulate_eob

352 .skip_iter:	353 .skip_iter:

353 mova [qcoeffq+ncoeffq*2+ 0], m5	354 mova [r3q+ncoeffq*2+ 0], m5

354 mova [qcoeffq+ncoeffq*2+16], m5	355 mova [r3q+ncoeffq*2+16], m5

355 mova [dqcoeffq+ncoeffq*2+ 0], m5	356 mova [r4q+ncoeffq*2+ 0], m5

356 mova [dqcoeffq+ncoeffq*2+16], m5	357 mova [r4q+ncoeffq*2+16], m5

357 add ncoeffq, mmsize	358 add ncoeffq, mmsize

358 jl .ac_only_loop	359 jl .ac_only_loop

359 %endif	360 %endif

360	361

361 .accumulate_eob:	362 .accumulate_eob:

362 ; horizontally accumulate/max eobs and write into [eob] memory pointer	363 ; horizontally accumulate/max eobs and write into [eob] memory pointer

363 mov r2, eobmp	364 mov r2, eobmp

364 pshufd m7, m8, 0xe	365 pshufd m7, m8, 0xe

365 pmaxsw m8, m7	366 pmaxsw m8, m7

366 pshuflw m7, m8, 0xe	367 pshuflw m7, m8, 0xe

367 pmaxsw m8, m7	368 pmaxsw m8, m7

368 pshuflw m7, m8, 0x1	369 pshuflw m7, m8, 0x1

369 pmaxsw m8, m7	370 pmaxsw m8, m7

370 pextrw r6, m8, 0	371 pextrw r6, m8, 0

371 mov [r2], r6	372 mov [r2], r6

372 RET	373 RET

373	374

374 ; skip-block, i.e. just write all zeroes	375 ; skip-block, i.e. just write all zeroes

375 .blank:	376 .blank:

376 mov r0, dqcoeffmp	377 mov r0, dqcoeffmp

377 movifnidn ncoeffq, ncoeffmp	378 movifnidn ncoeffq, ncoeffmp

378 mov r2, qcoeffmp	379 mov r2, qcoeffmp

379 mov r3, eobmp	380 mov r3, eobmp

380 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob	381

381 lea dqcoeffq, [dqcoeffq+ncoeffq*2]	382 lea r0q, [r0q+ncoeffq*2]

382 lea qcoeffq, [ qcoeffq+ncoeffq*2]	383 lea r2q, [r2q+ncoeffq*2]

383 neg ncoeffq	384 neg ncoeffq

384 pxor m7, m7	385 pxor m7, m7

385 .blank_loop:	386 .blank_loop:

386 mova [dqcoeffq+ncoeffq*2+ 0], m7	387 mova [r0q+ncoeffq*2+ 0], m7

387 mova [dqcoeffq+ncoeffq*2+16], m7	388 mova [r0q+ncoeffq*2+16], m7

388 mova [qcoeffq+ncoeffq*2+ 0], m7	389 mova [r2q+ncoeffq*2+ 0], m7

389 mova [qcoeffq+ncoeffq*2+16], m7	390 mova [r2q+ncoeffq*2+16], m7

390 add ncoeffq, mmsize	391 add ncoeffq, mmsize

391 jl .blank_loop	392 jl .blank_loop

392 mov word [eobq], 0	393 mov word [r3q], 0

393 RET	394 RET

394 %endmacro	395 %endmacro

395	396

396 INIT_XMM ssse3	397 INIT_XMM ssse3

397 QUANTIZE_FP fp, 7	398 QUANTIZE_FP fp, 7

398 QUANTIZE_FP fp_32x32, 7	399 QUANTIZE_FP fp_32x32, 7

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c » ('j') | no next file with comments »