Diff: source/row_win.cc

Issue 1398633002: change all pix parameters to width for consistency (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: Created 5 years, 2 months ago
1 /*
2  * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  * Use of this source code is governed by a BSD-style license
5  * that can be found in the LICENSE file in the root of the source
6  * tree. An additional intellectual property rights grant can be found
7  * in the file PATENTS. All contributing project authors may
8  * be found in the AUTHORS file in the root of the source tree.
9  */
10
(...skipping 309 matching lines...)
320 };
321
322 // NV21 shuf 8 VU to 16 UV.
323 static const lvec8 kShuffleNV21 = {
324   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
325   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
326 };
327
328 // Duplicates gray value 3 times and fills in alpha opaque.
329 __declspec(naked)
330 - void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
330 + void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
331 __asm {
332 mov eax, [esp + 4] // src_y
333 mov edx, [esp + 8] // dst_argb
334 - mov ecx, [esp + 12] // pix
334 + mov ecx, [esp + 12] // width
335 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
336 pslld xmm5, 24
337
338 convertloop:
339 movq xmm0, qword ptr [eax]
340 lea eax, [eax + 8]
341 punpcklbw xmm0, xmm0
342 movdqa xmm1, xmm0
343 punpcklwd xmm0, xmm0
344 punpckhwd xmm1, xmm1
345 por xmm0, xmm5
346 por xmm1, xmm5
347 movdqu [edx], xmm0
348 movdqu [edx + 16], xmm1
349 lea edx, [edx + 32]
350 sub ecx, 8
351 jg convertloop
352 ret
353 }
354 }
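For readers skimming the assembly, a minimal scalar sketch of what J400ToARGBRow computes; the _C suffix follows libyuv's naming for scalar fallbacks, but the body here is an illustration, not the library's fallback verbatim:

#include <stdint.h>

// Each gray byte becomes one BGRA pixel with B = G = R = gray and
// alpha forced to 0xff (the 0xff000000 mask built with pcmpeqb/pslld).
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t y = src_y[x];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 0xff;  // A (opaque)
    dst_argb += 4;
  }
}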
355
356 #ifdef HAS_J400TOARGBROW_AVX2
357 // Duplicates gray value 3 times and fills in alpha opaque.
358 __declspec(naked)
359 - void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
359 + void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
360 __asm {
361 mov eax, [esp + 4] // src_y
362 mov edx, [esp + 8] // dst_argb
363 - mov ecx, [esp + 12] // pix
363 + mov ecx, [esp + 12] // width
364 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
365 vpslld ymm5, ymm5, 24
366
367 convertloop:
368 vmovdqu xmm0, [eax]
369 lea eax, [eax + 16]
370 vpermq ymm0, ymm0, 0xd8
371 vpunpcklbw ymm0, ymm0, ymm0
372 vpermq ymm0, ymm0, 0xd8
373 vpunpckhwd ymm1, ymm0, ymm0
374 vpunpcklwd ymm0, ymm0, ymm0
375 vpor ymm0, ymm0, ymm5
376 vpor ymm1, ymm1, ymm5
377 vmovdqu [edx], ymm0
378 vmovdqu [edx + 32], ymm1
379 lea edx, [edx + 64]
380 sub ecx, 16
381 jg convertloop
382 vzeroupper
383 ret
384 }
385 }
386 #endif  // HAS_J400TOARGBROW_AVX2
387
388 __declspec(naked)
389 - void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
389 + void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
390 __asm {
391 mov eax, [esp + 4] // src_rgb24
392 mov edx, [esp + 8] // dst_argb
393 - mov ecx, [esp + 12] // pix
393 + mov ecx, [esp + 12] // width
394 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
395 pslld xmm5, 24
396 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
397
398 convertloop:
399 movdqu xmm0, [eax]
400 movdqu xmm1, [eax + 16]
401 movdqu xmm3, [eax + 32]
402 lea eax, [eax + 48]
403 movdqa xmm2, xmm3
(...skipping 14 matching lines...)
418 movdqu [edx + 48], xmm3
419 lea edx, [edx + 64]
420 sub ecx, 16
421 jg convertloop
422 ret
423 }
424 }
425
426 __declspec(naked)
427 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
428 -                        int pix) {
428 +                        int width) {
429 __asm {
430 mov eax, [esp + 4] // src_raw
431 mov edx, [esp + 8] // dst_argb
432 - mov ecx, [esp + 12] // pix
432 + mov ecx, [esp + 12] // width
433 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
434 pslld xmm5, 24
435 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
436
437 convertloop:
438 movdqu xmm0, [eax]
439 movdqu xmm1, [eax + 16]
440 movdqu xmm3, [eax + 32]
441 lea eax, [eax + 48]
442 movdqa xmm2, xmm3
(...skipping 21 matching lines...)
465 // pmul method to replicate bits. 465 // pmul method to replicate bits.
466 // Math to replicate bits: 466 // Math to replicate bits:
467 // (v << 8) | (v << 3) 467 // (v << 8) | (v << 3)
468 // v * 256 + v * 8 468 // v * 256 + v * 8
469 // v * (256 + 8) 469 // v * (256 + 8)
470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
471 // 20 instructions. 471 // 20 instructions.
472 __declspec(naked) 472 __declspec(naked)
473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, 473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
474 int pix) { 474 int width) {
475 __asm { 475 __asm {
476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits 476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
477 movd xmm5, eax 477 movd xmm5, eax
478 pshufd xmm5, xmm5, 0 478 pshufd xmm5, xmm5, 0
479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
480 movd xmm6, eax 480 movd xmm6, eax
481 pshufd xmm6, xmm6, 0 481 pshufd xmm6, xmm6, 0
482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
483 psllw xmm3, 11 483 psllw xmm3, 11
484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green 484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
485 psllw xmm4, 10 485 psllw xmm4, 10
486 psrlw xmm4, 5 486 psrlw xmm4, 5
487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha 487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
488 psllw xmm7, 8 488 psllw xmm7, 8
489 489
490 mov eax, [esp + 4] // src_rgb565 490 mov eax, [esp + 4] // src_rgb565
491 mov edx, [esp + 8] // dst_argb 491 mov edx, [esp + 8] // dst_argb
492 mov ecx, [esp + 12] // pix 492 mov ecx, [esp + 12] // width
493 sub edx, eax 493 sub edx, eax
494 sub edx, eax 494 sub edx, eax
495 495
496 convertloop: 496 convertloop:
497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
498 movdqa xmm1, xmm0 498 movdqa xmm1, xmm0
499 movdqa xmm2, xmm0 499 movdqa xmm2, xmm0
500 pand xmm1, xmm3 // R in upper 5 bits 500 pand xmm1, xmm3 // R in upper 5 bits
501 psllw xmm2, 11 // B in upper 5 bits 501 psllw xmm2, 11 // B in upper 5 bits
502 pmulhuw xmm1, xmm5 // * (256 + 8) 502 pmulhuw xmm1, xmm5 // * (256 + 8)
(...skipping 17 matching lines...) Expand all
520 520
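The pmul trick is easier to see in scalar form: expanding a 5-bit channel v to 8 bits as (v << 3) | (v >> 2) is exactly the high word of (v << 11) * (256 + 8), since v * 264 / 32 = v * 8 + v / 4. A minimal scalar sketch under that reading (an illustration, not the library's C fallback verbatim):

#include <stdint.h>

void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb,
                       int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
    uint8_t b = p & 0x1f;          // bits 4:0
    uint8_t g = (p >> 5) & 0x3f;   // bits 10:5
    uint8_t r = (p >> 11) & 0x1f;  // bits 15:11
    dst_argb[0] = (uint8_t)((b << 3) | (b >> 2));  // replicate 5 -> 8 bits
    dst_argb[1] = (uint8_t)((g << 2) | (g >> 4));  // replicate 6 -> 8 bits
    dst_argb[2] = (uint8_t)((r << 3) | (r >> 2));
    dst_argb[3] = 0xff;
    src_rgb565 += 2;
    dst_argb += 4;
  }
}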
521 #ifdef HAS_RGB565TOARGBROW_AVX2
522 // pmul method to replicate bits.
523 // Math to replicate bits:
524 // (v << 8) | (v << 3)
525 // v * 256 + v * 8
526 // v * (256 + 8)
527 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
528 __declspec(naked)
529 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
530 -                          int pix) {
530 +                          int width) {
531 __asm {
532 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
533 vmovd xmm5, eax
534 vbroadcastss ymm5, xmm5
535 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
536 movd xmm6, eax
537 vbroadcastss ymm6, xmm6
538 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
539 vpsllw ymm3, ymm3, 11
540 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
541 vpsllw ymm4, ymm4, 10
542 vpsrlw ymm4, ymm4, 5
543 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
544 vpsllw ymm7, ymm7, 8
545
546 mov eax, [esp + 4] // src_rgb565
547 mov edx, [esp + 8] // dst_argb
548 - mov ecx, [esp + 12] // pix
548 + mov ecx, [esp + 12] // width
549 sub edx, eax
550 sub edx, eax
551
552 convertloop:
553 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
554 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
555 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
556 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
557 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
558 vpsllw ymm1, ymm1, 8
(...skipping 12 matching lines...)
571 jg convertloop
572 vzeroupper
573 ret
574 }
575 }
576 #endif  // HAS_RGB565TOARGBROW_AVX2
577
578 #ifdef HAS_ARGB1555TOARGBROW_AVX2
579 __declspec(naked)
580 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
581 -                            int pix) {
581 +                            int width) {
582 __asm {
583 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
584 vmovd xmm5, eax
585 vbroadcastss ymm5, xmm5
586 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
587 movd xmm6, eax
588 vbroadcastss ymm6, xmm6
589 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
590 vpsllw ymm3, ymm3, 11
591 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
592 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
593 vpsllw ymm7, ymm7, 8
594
595 mov eax, [esp + 4] // src_argb1555
596 mov edx, [esp + 8] // dst_argb
597 - mov ecx, [esp + 12] // pix
597 + mov ecx, [esp + 12] // width
598 sub edx, eax
599 sub edx, eax
600
601 convertloop:
602 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
603 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
604 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
605 vpand ymm1, ymm1, ymm3
606 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
607 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
(...skipping 15 matching lines...)
623 jg convertloop
624 vzeroupper
625 ret
626 }
627 }
628 #endif  // HAS_ARGB1555TOARGBROW_AVX2
629
630 #ifdef HAS_ARGB4444TOARGBROW_AVX2
631 __declspec(naked)
632 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
633 -                            int pix) {
633 +                            int width) {
634 __asm {
635 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
636 vmovd xmm4, eax
637 vbroadcastss ymm4, xmm4
638 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
639 mov eax, [esp + 4] // src_argb4444
640 mov edx, [esp + 8] // dst_argb
641 - mov ecx, [esp + 12] // pix
641 + mov ecx, [esp + 12] // width
642 sub edx, eax
643 sub edx, eax
644
645 convertloop:
646 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
647 vpand ymm2, ymm0, ymm5 // mask high nibbles
648 vpand ymm0, ymm0, ymm4 // mask low nibbles
649 vpsrlw ymm3, ymm2, 4
650 vpsllw ymm1, ymm0, 4
651 vpor ymm2, ymm2, ymm3
652 vpor ymm0, ymm0, ymm1
653 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
654 vpermq ymm2, ymm2, 0xd8
655 vpunpckhbw ymm1, ymm0, ymm2
656 vpunpcklbw ymm0, ymm0, ymm2
657 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
658 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
659 lea eax, [eax + 32]
660 sub ecx, 16
661 jg convertloop
662 vzeroupper
663 ret
664 }
665 }
666 #endif  // HAS_ARGB4444TOARGBROW_AVX2
667
668 // 24 instructions
669 __declspec(naked)
670 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
671 -                            int pix) {
671 +                            int width) {
672 __asm {
673 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
674 movd xmm5, eax
675 pshufd xmm5, xmm5, 0
676 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
677 movd xmm6, eax
678 pshufd xmm6, xmm6, 0
679 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
680 psllw xmm3, 11
681 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
682 psrlw xmm4, 6
683 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
684 psllw xmm7, 8
685
686 mov eax, [esp + 4] // src_argb1555
687 mov edx, [esp + 8] // dst_argb
688 - mov ecx, [esp + 12] // pix
688 + mov ecx, [esp + 12] // width
689 sub edx, eax
690 sub edx, eax
691
692 convertloop:
693 movdqu xmm0, [eax] // fetch 8 pixels of 1555
694 movdqa xmm1, xmm0
695 movdqa xmm2, xmm0
696 psllw xmm1, 1 // R in upper 5 bits
697 psllw xmm2, 11 // B in upper 5 bits
698 pand xmm1, xmm3
(...skipping 15 matching lines...)
714 lea eax, [eax + 16]
715 sub ecx, 8
716 jg convertloop
717 ret
718 }
719 }
720
721 // 18 instructions.
722 __declspec(naked)
723 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
724 -                            int pix) {
724 +                            int width) {
725 __asm {
726 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
727 movd xmm4, eax
728 pshufd xmm4, xmm4, 0
729 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
730 pslld xmm5, 4
731 mov eax, [esp + 4] // src_argb4444
732 mov edx, [esp + 8] // dst_argb
733 - mov ecx, [esp + 12] // pix
733 + mov ecx, [esp + 12] // width
734 sub edx, eax
735 sub edx, eax
736
737 convertloop:
738 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
739 movdqa xmm2, xmm0
740 pand xmm0, xmm4 // mask low nibbles
741 pand xmm2, xmm5 // mask high nibbles
742 movdqa xmm1, xmm0
743 movdqa xmm3, xmm2
744 psllw xmm1, 4
745 psrlw xmm3, 4
746 por xmm0, xmm1
747 por xmm2, xmm3
748 movdqa xmm1, xmm0
749 punpcklbw xmm0, xmm2
750 punpckhbw xmm1, xmm2
751 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
752 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
753 lea eax, [eax + 16]
754 sub ecx, 8
755 jg convertloop
756 ret
757 }
758 }
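The nibble expansion reads more simply in scalar form: a 4-bit channel becomes 8 bits as (v << 4) | v, i.e. v * 17, so 0xf maps to 0xff. A sketch under that reading (pixel layout inferred from the little-endian bgra4444 fetch above):

#include <stdint.h>

void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, uint8_t* dst_argb,
                         int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b = src_argb4444[0] & 0x0f;
    uint8_t g = src_argb4444[0] >> 4;
    uint8_t r = src_argb4444[1] & 0x0f;
    uint8_t a = src_argb4444[1] >> 4;
    dst_argb[0] = (uint8_t)(b * 0x11);  // replicate low nibble to both halves
    dst_argb[1] = (uint8_t)(g * 0x11);
    dst_argb[2] = (uint8_t)(r * 0x11);
    dst_argb[3] = (uint8_t)(a * 0x11);
    src_argb4444 += 2;
    dst_argb += 4;
  }
}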
759
760 __declspec(naked)
761 - void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
761 + void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
762 __asm {
763 mov eax, [esp + 4] // src_argb
764 mov edx, [esp + 8] // dst_rgb
765 - mov ecx, [esp + 12] // pix
765 + mov ecx, [esp + 12] // width
766 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
767
768 convertloop:
769 movdqu xmm0, [eax] // fetch 16 pixels of argb
770 movdqu xmm1, [eax + 16]
771 movdqu xmm2, [eax + 32]
772 movdqu xmm3, [eax + 48]
773 lea eax, [eax + 64]
774 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
775 pshufb xmm1, xmm6
(...skipping 13 matching lines...)
789 movdqu [edx + 16], xmm1 // store 1
790 movdqu [edx + 32], xmm2 // store 2
791 lea edx, [edx + 48]
792 sub ecx, 16
793 jg convertloop
794 ret
795 }
796 }
797
798 __declspec(naked)
799 - void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
799 + void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
800 __asm {
801 mov eax, [esp + 4] // src_argb
802 mov edx, [esp + 8] // dst_rgb
803 - mov ecx, [esp + 12] // pix
803 + mov ecx, [esp + 12] // width
804 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
805
806 convertloop:
807 movdqu xmm0, [eax] // fetch 16 pixels of argb
808 movdqu xmm1, [eax + 16]
809 movdqu xmm2, [eax + 32]
810 movdqu xmm3, [eax + 48]
811 lea eax, [eax + 64]
812 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
813 pshufb xmm1, xmm6
(...skipping 13 matching lines...)
827 movdqu [edx + 16], xmm1 // store 1
828 movdqu [edx + 32], xmm2 // store 2
829 lea edx, [edx + 48]
830 sub ecx, 16
831 jg convertloop
832 ret
833 }
834 }
835
836 __declspec(naked)
837 - void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
837 + void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
838 __asm {
839 mov eax, [esp + 4] // src_argb
840 mov edx, [esp + 8] // dst_rgb
841 - mov ecx, [esp + 12] // pix
841 + mov ecx, [esp + 12] // width
842 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
843 psrld xmm3, 27
844 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
845 psrld xmm4, 26
846 pslld xmm4, 5
847 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
848 pslld xmm5, 11
849
850 convertloop:
851 movdqu xmm0, [eax] // fetch 4 pixels of argb
(...skipping 13 matching lines...)
865 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
866 lea edx, [edx + 8]
867 sub ecx, 4
868 jg convertloop
869 ret
870 }
871 }
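Scalar equivalent of the 565 pack, for orientation (an illustration only): truncate each 8-bit channel to 5/6/5 bits and merge, matching the 0x0000001f / 0x000007e0 / 0xfffff800 masks built before the loop:

#include <stdint.h>

void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t b = src_argb[0] >> 3;  // keep top 5 bits
    uint16_t g = src_argb[1] >> 2;  // keep top 6 bits
    uint16_t r = src_argb[2] >> 3;  // keep top 5 bits
    uint16_t p = (uint16_t)(b | (g << 5) | (r << 11));
    dst_rgb[0] = (uint8_t)p;         // little-endian low byte
    dst_rgb[1] = (uint8_t)(p >> 8);  // high byte
    src_argb += 4;
    dst_rgb += 2;
  }
}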
872
873 __declspec(naked)
874 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
875 -                                const uint32 dither4, int pix) {
875 +                                const uint32 dither4, int width) {
876 __asm {
877
878 mov eax, [esp + 4] // src_argb
879 mov edx, [esp + 8] // dst_rgb
880 movd xmm6, [esp + 12] // dither4
881 - mov ecx, [esp + 16] // pix
881 + mov ecx, [esp + 16] // width
882 punpcklbw xmm6, xmm6 // make dither 16 bytes
883 movdqa xmm7, xmm6
884 punpcklwd xmm6, xmm6
885 punpckhwd xmm7, xmm7
886 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
887 psrld xmm3, 27
888 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
889 psrld xmm4, 26
890 pslld xmm4, 5
891 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
(...skipping 19 matching lines...)
911 lea edx, [edx + 8]
912 sub ecx, 4
913 jg convertloop
914 ret
915 }
916 }
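The dithered variant adds one dither byte per pixel (dither4 holds four of them, replicated across the channels by the punpck sequence above) before truncating to 5/6/5. The add instruction itself falls inside the elided lines, so the saturating add in this sketch is our assumption about the elided body:

#include <stdint.h>

static uint8_t clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb,
                             uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // dither byte for this pixel
    uint16_t b = clamp255(src_argb[0] + d) >> 3;
    uint16_t g = clamp255(src_argb[1] + d) >> 2;
    uint16_t r = clamp255(src_argb[2] + d) >> 3;
    uint16_t p = (uint16_t)(b | (g << 5) | (r << 11));
    dst_rgb[0] = (uint8_t)p;
    dst_rgb[1] = (uint8_t)(p >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}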
917
918 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
919 __declspec(naked)
920 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
921 -                                const uint32 dither4, int pix) {
921 +                                const uint32 dither4, int width) {
922 __asm {
923 mov eax, [esp + 4] // src_argb
924 mov edx, [esp + 8] // dst_rgb
925 vbroadcastss xmm6, [esp + 12] // dither4
926 - mov ecx, [esp + 16] // pix
926 + mov ecx, [esp + 16] // width
927 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
928 vpermq ymm6, ymm6, 0xd8
929 vpunpcklwd ymm6, ymm6, ymm6
930 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
931 vpsrld ymm3, ymm3, 27
932 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
933 vpsrld ymm4, ymm4, 26
934 vpslld ymm4, ymm4, 5
935 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
936
(...skipping 16 matching lines...)
953 sub ecx, 8
954 jg convertloop
955 vzeroupper
956 ret
957 }
958 }
959 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
961 // TODO(fbarchard): Improve sign extension/packing. 961 // TODO(fbarchard): Improve sign extension/packing.
962 __declspec(naked) 962 __declspec(naked)
963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
964 __asm { 964 __asm {
965 mov eax, [esp + 4] // src_argb 965 mov eax, [esp + 4] // src_argb
966 mov edx, [esp + 8] // dst_rgb 966 mov edx, [esp + 8] // dst_rgb
967 mov ecx, [esp + 12] // pix 967 mov ecx, [esp + 12] // width
968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f 968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
969 psrld xmm4, 27 969 psrld xmm4, 27
970 movdqa xmm5, xmm4 // generate mask 0x000003e0 970 movdqa xmm5, xmm4 // generate mask 0x000003e0
971 pslld xmm5, 5 971 pslld xmm5, 5
972 movdqa xmm6, xmm4 // generate mask 0x00007c00 972 movdqa xmm6, xmm4 // generate mask 0x00007c00
973 pslld xmm6, 10 973 pslld xmm6, 10
974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
975 pslld xmm7, 15 975 pslld xmm7, 15
976 976
977 convertloop: 977 convertloop:
(...skipping 16 matching lines...) Expand all
994 lea eax, [eax + 16] 994 lea eax, [eax + 16]
995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
996 lea edx, [edx + 8] 996 lea edx, [edx + 8]
997 sub ecx, 4 997 sub ecx, 4
998 jg convertloop 998 jg convertloop
999 ret 999 ret
1000 } 1000 }
1001 } 1001 }
1002
1003 __declspec(naked)
1004 - void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1004 + void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
1005 __asm {
1006 mov eax, [esp + 4] // src_argb
1007 mov edx, [esp + 8] // dst_rgb
1008 - mov ecx, [esp + 12] // pix
1008 + mov ecx, [esp + 12] // width
1009 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
1010 psllw xmm4, 12
1011 movdqa xmm3, xmm4 // generate mask 0x00f000f0
1012 psrlw xmm3, 8
1013
1014 convertloop:
1015 movdqu xmm0, [eax] // fetch 4 pixels of argb
1016 movdqa xmm1, xmm0
1017 pand xmm0, xmm3 // low nibble
1018 pand xmm1, xmm4 // high nibble
1019 psrld xmm0, 4
1020 psrld xmm1, 8
1021 por xmm0, xmm1
1022 packuswb xmm0, xmm0
1023 lea eax, [eax + 16]
1024 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
1025 lea edx, [edx + 8]
1026 sub ecx, 4
1027 jg convertloop
1028 ret
1029 }
1030 }
1031
1032 #ifdef HAS_ARGBTORGB565ROW_AVX2
1033 __declspec(naked)
1034 - void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1034 + void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1035 __asm {
1036 mov eax, [esp + 4] // src_argb
1037 mov edx, [esp + 8] // dst_rgb
1038 - mov ecx, [esp + 12] // pix
1038 + mov ecx, [esp + 12] // width
1039 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
1040 vpsrld ymm3, ymm3, 27
1041 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
1042 vpsrld ymm4, ymm4, 26
1043 vpslld ymm4, ymm4, 5
1044 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
1045
1046 convertloop:
1047 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1048 vpsrld ymm2, ymm0, 5 // G
(...skipping 12 matching lines...)
1061 sub ecx, 8
1062 jg convertloop
1063 vzeroupper
1064 ret
1065 }
1066 }
1067 #endif  // HAS_ARGBTORGB565ROW_AVX2
1068
1069 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1070 __declspec(naked)
1071 - void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1071 + void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1072 __asm {
1073 mov eax, [esp + 4] // src_argb
1074 mov edx, [esp + 8] // dst_rgb
1075 - mov ecx, [esp + 12] // pix
1075 + mov ecx, [esp + 12] // width
1076 vpcmpeqb ymm4, ymm4, ymm4
1077 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
1078 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
1079 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
1080 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
1081 vpslld ymm7, ymm7, 15
1082
1083 convertloop:
1084 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1085 vpsrld ymm3, ymm0, 9 // R
(...skipping 15 matching lines...)
1101 sub ecx, 8
1102 jg convertloop
1103 vzeroupper
1104 ret
1105 }
1106 }
1107 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
1108
1109 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1110 __declspec(naked)
1111 - void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1111 + void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1112 __asm {
1113 mov eax, [esp + 4] // src_argb
1114 mov edx, [esp + 8] // dst_rgb
1115 - mov ecx, [esp + 12] // pix
1115 + mov ecx, [esp + 12] // width
1116 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
1117 vpsllw ymm4, ymm4, 12
1118 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
1119
1120 convertloop:
1121 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1122 vpand ymm1, ymm0, ymm4 // high nibble
1123 vpand ymm0, ymm0, ymm3 // low nibble
1124 vpsrld ymm1, ymm1, 8
1125 vpsrld ymm0, ymm0, 4
1126 vpor ymm0, ymm0, ymm1
1127 vpackuswb ymm0, ymm0, ymm0
1128 vpermq ymm0, ymm0, 0xd8
1129 lea eax, [eax + 32]
1130 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
1131 lea edx, [edx + 16]
1132 sub ecx, 8
1133 jg convertloop
1134 vzeroupper
1135 ret
1136 }
1137 }
1138 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
1139
1140 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1141 __declspec(naked)
1142 - void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1142 + void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1143 __asm {
1144 mov eax, [esp + 4] /* src_argb */
1145 mov edx, [esp + 8] /* dst_y */
1146 - mov ecx, [esp + 12] /* pix */
1146 + mov ecx, [esp + 12] /* width */
1147 movdqa xmm4, xmmword ptr kARGBToY
1148 movdqa xmm5, xmmword ptr kAddY16
1149
1150 convertloop:
1151 movdqu xmm0, [eax]
1152 movdqu xmm1, [eax + 16]
1153 movdqu xmm2, [eax + 32]
1154 movdqu xmm3, [eax + 48]
1155 pmaddubsw xmm0, xmm4
1156 pmaddubsw xmm1, xmm4
(...skipping 10 matching lines...)
1167 lea edx, [edx + 16]
1168 sub ecx, 16
1169 jg convertloop
1170 ret
1171 }
1172 }
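The Y weights live in kARGBToY, which is outside this diff chunk. Assuming libyuv's usual BT.601 studio-swing reference, the computation is equivalent to the sketch below; the SIMD path uses halved weights with pmaddubsw/phaddw, shifts right by 7, then adds the +16 offset via paddb kAddY16, matching this to within rounding:

#include <stdint.h>

static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  // 0x1080 = (16 << 8) + 128: +16 offset plus rounding.
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    // ARGB in memory is B, G, R, A.
    dst_y[x] = RGBToY(src_argb[2], src_argb[1], src_argb[0]);
    src_argb += 4;
  }
}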
1173
1174 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1175 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1176 __declspec(naked)
1177 - void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1177 + void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1178 __asm {
1179 mov eax, [esp + 4] /* src_argb */
1180 mov edx, [esp + 8] /* dst_y */
1181 - mov ecx, [esp + 12] /* pix */
1181 + mov ecx, [esp + 12] /* width */
1182 movdqa xmm4, xmmword ptr kARGBToYJ
1183 movdqa xmm5, xmmword ptr kAddYJ64
1184
1185 convertloop:
1186 movdqu xmm0, [eax]
1187 movdqu xmm1, [eax + 16]
1188 movdqu xmm2, [eax + 32]
1189 movdqu xmm3, [eax + 48]
1190 pmaddubsw xmm0, xmm4
1191 pmaddubsw xmm1, xmm4
(...skipping 16 matching lines...)
1208 }
1209
1210 #ifdef HAS_ARGBTOYROW_AVX2
1211 // vpermd for vphaddw + vpackuswb vpermd.
1212 static const lvec32 kPermdARGBToY_AVX = {
1213   0, 4, 1, 5, 2, 6, 3, 7
1214 };
1215
1216 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1217 __declspec(naked)
1218 - void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1218 + void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1219 __asm {
1220 mov eax, [esp + 4] /* src_argb */
1221 mov edx, [esp + 8] /* dst_y */
1222 - mov ecx, [esp + 12] /* pix */
1222 + mov ecx, [esp + 12] /* width */
1223 vbroadcastf128 ymm4, xmmword ptr kARGBToY
1224 vbroadcastf128 ymm5, xmmword ptr kAddY16
1225 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1226
1227 convertloop:
1228 vmovdqu ymm0, [eax]
1229 vmovdqu ymm1, [eax + 32]
1230 vmovdqu ymm2, [eax + 64]
1231 vmovdqu ymm3, [eax + 96]
1232 vpmaddubsw ymm0, ymm0, ymm4
(...skipping 14 matching lines...)
1247 jg convertloop
1248 vzeroupper
1249 ret
1250 }
1251 }
1252 #endif  // HAS_ARGBTOYROW_AVX2
1253
1254 #ifdef HAS_ARGBTOYJROW_AVX2
1255 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1256 __declspec(naked)
1257 - void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1257 + void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1258 __asm {
1259 mov eax, [esp + 4] /* src_argb */
1260 mov edx, [esp + 8] /* dst_y */
1261 - mov ecx, [esp + 12] /* pix */
1261 + mov ecx, [esp + 12] /* width */
1262 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1263 vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1264 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1265
1266 convertloop:
1267 vmovdqu ymm0, [eax]
1268 vmovdqu ymm1, [eax + 32]
1269 vmovdqu ymm2, [eax + 64]
1270 vmovdqu ymm3, [eax + 96]
1271 vpmaddubsw ymm0, ymm0, ymm4
(...skipping 14 matching lines...)
1286 sub ecx, 32
1287 jg convertloop
1288
1289 vzeroupper
1290 ret
1291 }
1292 }
1293 #endif  // HAS_ARGBTOYJROW_AVX2
1294
1295 __declspec(naked)
1296 - void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1296 + void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1297 __asm {
1298 mov eax, [esp + 4] /* src_argb */
1299 mov edx, [esp + 8] /* dst_y */
1300 - mov ecx, [esp + 12] /* pix */
1300 + mov ecx, [esp + 12] /* width */
1301 movdqa xmm4, xmmword ptr kBGRAToY
1302 movdqa xmm5, xmmword ptr kAddY16
1303
1304 convertloop:
1305 movdqu xmm0, [eax]
1306 movdqu xmm1, [eax + 16]
1307 movdqu xmm2, [eax + 32]
1308 movdqu xmm3, [eax + 48]
1309 pmaddubsw xmm0, xmm4
1310 pmaddubsw xmm1, xmm4
1311 pmaddubsw xmm2, xmm4
1312 pmaddubsw xmm3, xmm4
1313 lea eax, [eax + 64]
1314 phaddw xmm0, xmm1
1315 phaddw xmm2, xmm3
1316 psrlw xmm0, 7
1317 psrlw xmm2, 7
1318 packuswb xmm0, xmm2
1319 paddb xmm0, xmm5
1320 movdqu [edx], xmm0
1321 lea edx, [edx + 16]
1322 sub ecx, 16
1323 jg convertloop
1324 ret
1325 }
1326 }
1327
1328 __declspec(naked)
1329 - void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1329 + void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1330 __asm {
1331 mov eax, [esp + 4] /* src_argb */
1332 mov edx, [esp + 8] /* dst_y */
1333 - mov ecx, [esp + 12] /* pix */
1333 + mov ecx, [esp + 12] /* width */
1334 movdqa xmm4, xmmword ptr kABGRToY
1335 movdqa xmm5, xmmword ptr kAddY16
1336
1337 convertloop:
1338 movdqu xmm0, [eax]
1339 movdqu xmm1, [eax + 16]
1340 movdqu xmm2, [eax + 32]
1341 movdqu xmm3, [eax + 48]
1342 pmaddubsw xmm0, xmm4
1343 pmaddubsw xmm1, xmm4
1344 pmaddubsw xmm2, xmm4
1345 pmaddubsw xmm3, xmm4
1346 lea eax, [eax + 64]
1347 phaddw xmm0, xmm1
1348 phaddw xmm2, xmm3
1349 psrlw xmm0, 7
1350 psrlw xmm2, 7
1351 packuswb xmm0, xmm2
1352 paddb xmm0, xmm5
1353 movdqu [edx], xmm0
1354 lea edx, [edx + 16]
1355 sub ecx, 16
1356 jg convertloop
1357 ret
1358 }
1359 }
1360
1361 __declspec(naked)
1362 - void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1362 + void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1363 __asm {
1364 mov eax, [esp + 4] /* src_argb */
1365 mov edx, [esp + 8] /* dst_y */
1366 - mov ecx, [esp + 12] /* pix */
1366 + mov ecx, [esp + 12] /* width */
1367 movdqa xmm4, xmmword ptr kRGBAToY
1368 movdqa xmm5, xmmword ptr kAddY16
1369
1370 convertloop:
1371 movdqu xmm0, [eax]
1372 movdqu xmm1, [eax + 16]
1373 movdqu xmm2, [eax + 32]
1374 movdqu xmm3, [eax + 48]
1375 pmaddubsw xmm0, xmm4
1376 pmaddubsw xmm1, xmm4
(...skipping 17 matching lines...)
1394 __declspec(naked)
1395 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1396                        uint8* dst_u, uint8* dst_v, int width) {
1397 __asm {
1398 push esi
1399 push edi
1400 mov eax, [esp + 8 + 4] // src_argb
1401 mov esi, [esp + 8 + 8] // src_stride_argb
1402 mov edx, [esp + 8 + 12] // dst_u
1403 mov edi, [esp + 8 + 16] // dst_v
1404 - mov ecx, [esp + 8 + 20] // pix
1404 + mov ecx, [esp + 8 + 20] // width
1405 movdqa xmm5, xmmword ptr kAddUV128
1406 movdqa xmm6, xmmword ptr kARGBToV
1407 movdqa xmm7, xmmword ptr kARGBToU
1408 sub edi, edx // stride from u to v
1409
1410 convertloop:
1411 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1412 movdqu xmm0, [eax]
1413 movdqu xmm4, [eax + esi]
1414 pavgb xmm0, xmm4
(...skipping 49 matching lines...)
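Step 1's pavgb against the row at src_stride_argb subsamples two rows of pixels down to one half-width row; step 2 (elided above) applies the U and V weights from kARGBToU/kARGBToV and biases by kAddUV128. A scalar sketch, assuming libyuv's BT.601 chroma reference; the plain 2x2 average here rounds slightly differently than the chained pavgb path:

#include <stdint.h>

static uint8_t RGBToU(int r, int g, int b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // +128 bias
}
static uint8_t RGBToV(int r, int g, int b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

void ARGBToUVRow_C(const uint8_t* src_argb0, int src_stride_argb,
                   uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* src_argb1 = src_argb0 + src_stride_argb;  // second row
  for (int x = 0; x < width; x += 2) {
    // Average a 2x2 block of pixels (memory order B, G, R, A).
    int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    *dst_u++ = RGBToU(r, g, b);
    *dst_v++ = RGBToV(r, g, b);
    src_argb0 += 8;
    src_argb1 += 8;
  }
}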
1464 __declspec(naked)
1465 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1466                         uint8* dst_u, uint8* dst_v, int width) {
1467 __asm {
1468 push esi
1469 push edi
1470 mov eax, [esp + 8 + 4] // src_argb
1471 mov esi, [esp + 8 + 8] // src_stride_argb
1472 mov edx, [esp + 8 + 12] // dst_u
1473 mov edi, [esp + 8 + 16] // dst_v
1474 - mov ecx, [esp + 8 + 20] // pix
1474 + mov ecx, [esp + 8 + 20] // width
1475 movdqa xmm5, xmmword ptr kAddUVJ128
1476 movdqa xmm6, xmmword ptr kARGBToVJ
1477 movdqa xmm7, xmmword ptr kARGBToUJ
1478 sub edi, edx // stride from u to v
1479
1480 convertloop:
1481 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1482 movdqu xmm0, [eax]
1483 movdqu xmm4, [eax + esi]
1484 pavgb xmm0, xmm4
(...skipping 51 matching lines...)
1536 __declspec(naked)
1537 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1538                       uint8* dst_u, uint8* dst_v, int width) {
1539 __asm {
1540 push esi
1541 push edi
1542 mov eax, [esp + 8 + 4] // src_argb
1543 mov esi, [esp + 8 + 8] // src_stride_argb
1544 mov edx, [esp + 8 + 12] // dst_u
1545 mov edi, [esp + 8 + 16] // dst_v
1546 - mov ecx, [esp + 8 + 20] // pix
1546 + mov ecx, [esp + 8 + 20] // width
1547 vbroadcastf128 ymm5, xmmword ptr kAddUV128
1548 vbroadcastf128 ymm6, xmmword ptr kARGBToV
1549 vbroadcastf128 ymm7, xmmword ptr kARGBToU
1550 sub edi, edx // stride from u to v
1551
1552 convertloop:
1553 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1554 vmovdqu ymm0, [eax]
1555 vmovdqu ymm1, [eax + 32]
1556 vmovdqu ymm2, [eax + 64]
(...skipping 42 matching lines...)
1599 #endif  // HAS_ARGBTOUVROW_AVX2
1600
1601 __declspec(naked)
1602 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1603                           uint8* dst_u, uint8* dst_v, int width) {
1604 __asm {
1605 push edi
1606 mov eax, [esp + 4 + 4] // src_argb
1607 mov edx, [esp + 4 + 8] // dst_u
1608 mov edi, [esp + 4 + 12] // dst_v
1609 - mov ecx, [esp + 4 + 16] // pix
1609 + mov ecx, [esp + 4 + 16] // width
1610 movdqa xmm5, xmmword ptr kAddUV128
1611 movdqa xmm6, xmmword ptr kARGBToV
1612 movdqa xmm7, xmmword ptr kARGBToU
1613 sub edi, edx // stride from u to v
1614
1615 convertloop:
1616 /* convert to U and V */
1617 movdqu xmm0, [eax] // U
1618 movdqu xmm1, [eax + 16]
1619 movdqu xmm2, [eax + 32]
(...skipping 36 matching lines...)
1656 }
1657
1658 __declspec(naked)
1659 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1660                           uint8* dst_u, uint8* dst_v, int width) {
1661 __asm {
1662 push edi
1663 mov eax, [esp + 4 + 4] // src_argb
1664 mov edx, [esp + 4 + 8] // dst_u
1665 mov edi, [esp + 4 + 12] // dst_v
1666 - mov ecx, [esp + 4 + 16] // pix
1666 + mov ecx, [esp + 4 + 16] // width
1667 movdqa xmm5, xmmword ptr kAddUV128
1668 movdqa xmm6, xmmword ptr kARGBToV
1669 movdqa xmm7, xmmword ptr kARGBToU
1670 sub edi, edx // stride from u to v
1671
1672 convertloop:
1673 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1674 movdqu xmm0, [eax]
1675 movdqu xmm1, [eax + 16]
1676 movdqu xmm2, [eax + 32]
(...skipping 39 matching lines...)
1716 __declspec(naked)
1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1718                        uint8* dst_u, uint8* dst_v, int width) {
1719 __asm {
1720 push esi
1721 push edi
1722 mov eax, [esp + 8 + 4] // src_argb
1723 mov esi, [esp + 8 + 8] // src_stride_argb
1724 mov edx, [esp + 8 + 12] // dst_u
1725 mov edi, [esp + 8 + 16] // dst_v
1726 - mov ecx, [esp + 8 + 20] // pix
1726 + mov ecx, [esp + 8 + 20] // width
1727 movdqa xmm5, xmmword ptr kAddUV128
1728 movdqa xmm6, xmmword ptr kBGRAToV
1729 movdqa xmm7, xmmword ptr kBGRAToU
1730 sub edi, edx // stride from u to v
1731
1732 convertloop:
1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1734 movdqu xmm0, [eax]
1735 movdqu xmm4, [eax + esi]
1736 pavgb xmm0, xmm4
(...skipping 49 matching lines...)
1786 __declspec(naked) 1786 __declspec(naked)
1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1788 uint8* dst_u, uint8* dst_v, int width) { 1788 uint8* dst_u, uint8* dst_v, int width) {
1789 __asm { 1789 __asm {
1790 push esi 1790 push esi
1791 push edi 1791 push edi
1792 mov eax, [esp + 8 + 4] // src_argb 1792 mov eax, [esp + 8 + 4] // src_argb
1793 mov esi, [esp + 8 + 8] // src_stride_argb 1793 mov esi, [esp + 8 + 8] // src_stride_argb
1794 mov edx, [esp + 8 + 12] // dst_u 1794 mov edx, [esp + 8 + 12] // dst_u
1795 mov edi, [esp + 8 + 16] // dst_v 1795 mov edi, [esp + 8 + 16] // dst_v
1796 mov ecx, [esp + 8 + 20] // pix 1796 mov ecx, [esp + 8 + 20] // width
1797 movdqa xmm5, xmmword ptr kAddUV128 1797 movdqa xmm5, xmmword ptr kAddUV128
1798 movdqa xmm6, xmmword ptr kABGRToV 1798 movdqa xmm6, xmmword ptr kABGRToV
1799 movdqa xmm7, xmmword ptr kABGRToU 1799 movdqa xmm7, xmmword ptr kABGRToU
1800 sub edi, edx // stride from u to v 1800 sub edi, edx // stride from u to v
1801 1801
1802 convertloop: 1802 convertloop:
1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1804 movdqu xmm0, [eax] 1804 movdqu xmm0, [eax]
1805 movdqu xmm4, [eax + esi] 1805 movdqu xmm4, [eax + esi]
1806 pavgb xmm0, xmm4 1806 pavgb xmm0, xmm4
(...skipping 49 matching lines...)
1856 __declspec(naked) 1856 __declspec(naked)
1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1858 uint8* dst_u, uint8* dst_v, int width) { 1858 uint8* dst_u, uint8* dst_v, int width) {
1859 __asm { 1859 __asm {
1860 push esi 1860 push esi
1861 push edi 1861 push edi
1862 mov eax, [esp + 8 + 4] // src_argb 1862 mov eax, [esp + 8 + 4] // src_argb
1863 mov esi, [esp + 8 + 8] // src_stride_argb 1863 mov esi, [esp + 8 + 8] // src_stride_argb
1864 mov edx, [esp + 8 + 12] // dst_u 1864 mov edx, [esp + 8 + 12] // dst_u
1865 mov edi, [esp + 8 + 16] // dst_v 1865 mov edi, [esp + 8 + 16] // dst_v
1866 mov ecx, [esp + 8 + 20] // pix 1866 mov ecx, [esp + 8 + 20] // width
1867 movdqa xmm5, xmmword ptr kAddUV128 1867 movdqa xmm5, xmmword ptr kAddUV128
1868 movdqa xmm6, xmmword ptr kRGBAToV 1868 movdqa xmm6, xmmword ptr kRGBAToV
1869 movdqa xmm7, xmmword ptr kRGBAToU 1869 movdqa xmm7, xmmword ptr kRGBAToU
1870 sub edi, edx // stride from u to v 1870 sub edi, edx // stride from u to v
1871 1871
1872 convertloop: 1872 convertloop:
1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874 movdqu xmm0, [eax] 1874 movdqu xmm0, [eax]
1875 movdqu xmm4, [eax + esi] 1875 movdqu xmm4, [eax + esi]
1876 pavgb xmm0, xmm4 1876 pavgb xmm0, xmm4
(...skipping 1754 matching lines...)
3631 sub ecx, 8 3631 sub ecx, 8
3632 jg convertloop 3632 jg convertloop
3633 vzeroupper 3633 vzeroupper
3634 ret 3634 ret
3635 } 3635 }
3636 } 3636 }
3637 #endif // HAS_ARGBMIRRORROW_AVX2 3637 #endif // HAS_ARGBMIRRORROW_AVX2
3638 3638
3639 #ifdef HAS_SPLITUVROW_SSE2 3639 #ifdef HAS_SPLITUVROW_SSE2
3640 __declspec(naked) 3640 __declspec(naked)
3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
3642 __asm { 3642 __asm {
3643 push edi 3643 push edi
3644 mov eax, [esp + 4 + 4] // src_uv 3644 mov eax, [esp + 4 + 4] // src_uv
3645 mov edx, [esp + 4 + 8] // dst_u 3645 mov edx, [esp + 4 + 8] // dst_u
3646 mov edi, [esp + 4 + 12] // dst_v 3646 mov edi, [esp + 4 + 12] // dst_v
3647 mov ecx, [esp + 4 + 16] // pix 3647 mov ecx, [esp + 4 + 16] // width
3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3649 psrlw xmm5, 8 3649 psrlw xmm5, 8
3650 sub edi, edx 3650 sub edi, edx
3651 3651
3652 convertloop: 3652 convertloop:
3653 movdqu xmm0, [eax] 3653 movdqu xmm0, [eax]
3654 movdqu xmm1, [eax + 16] 3654 movdqu xmm1, [eax + 16]
3655 lea eax, [eax + 32] 3655 lea eax, [eax + 32]
3656 movdqa xmm2, xmm0 3656 movdqa xmm2, xmm0
3657 movdqa xmm3, xmm1 3657 movdqa xmm3, xmm1
(...skipping 11 matching lines...)
3669 3669
3670 pop edi 3670 pop edi
3671 ret 3671 ret
3672 } 3672 }
3673 } 3673 }
3674 3674
3675 #endif // HAS_SPLITUVROW_SSE2 3675 #endif // HAS_SPLITUVROW_SSE2
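SplitUVRow deinterleaves an NV12-style packed UV plane into separate U and V planes. A minimal scalar sketch of the same operation (illustrative helper name):

    #include <stdint.h>

    /* Scalar model of SplitUVRow: even bytes are U, odd bytes are V. */
    static void SplitUVRow_C_sketch(const uint8_t* src_uv,
                                    uint8_t* dst_u, uint8_t* dst_v,
                                    int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * x + 0];
        dst_v[x] = src_uv[2 * x + 1];
      }
    }

The SSE2 version above processes 16 pixels per iteration using the 0x00ff00ff mask plus packuswb; the AVX2 version below doubles that and needs an extra vpermq to undo the lane interleave that vpackuswb introduces.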
3676 3676
3677 #ifdef HAS_SPLITUVROW_AVX2 3677 #ifdef HAS_SPLITUVROW_AVX2
3678 __declspec(naked) 3678 __declspec(naked)
3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
3680 __asm { 3680 __asm {
3681 push edi 3681 push edi
3682 mov eax, [esp + 4 + 4] // src_uv 3682 mov eax, [esp + 4 + 4] // src_uv
3683 mov edx, [esp + 4 + 8] // dst_u 3683 mov edx, [esp + 4 + 8] // dst_u
3684 mov edi, [esp + 4 + 12] // dst_v 3684 mov edi, [esp + 4 + 12] // dst_v
3685 mov ecx, [esp + 4 + 16] // pix 3685 mov ecx, [esp + 4 + 16] // width
3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3687 vpsrlw ymm5, ymm5, 8 3687 vpsrlw ymm5, ymm5, 8
3688 sub edi, edx 3688 sub edi, edx
3689 3689
3690 convertloop: 3690 convertloop:
3691 vmovdqu ymm0, [eax] 3691 vmovdqu ymm0, [eax]
3692 vmovdqu ymm1, [eax + 32] 3692 vmovdqu ymm1, [eax + 32]
3693 lea eax, [eax + 64] 3693 lea eax, [eax + 64]
3694 vpsrlw ymm2, ymm0, 8 // odd bytes 3694 vpsrlw ymm2, ymm0, 8 // odd bytes
3695 vpsrlw ymm3, ymm1, 8 3695 vpsrlw ymm3, ymm1, 8
(...skipping 321 matching lines...)
4017 mov ecx, [esp + 12] // count 4017 mov ecx, [esp + 12] // count
4018 rep stosd 4018 rep stosd
4019 mov edi, edx 4019 mov edi, edx
4020 ret 4020 ret
4021 } 4021 }
4022 } 4022 }
4023 #endif // HAS_SETROW_X86 4023 #endif // HAS_SETROW_X86
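The rep stosd tail visible above (the rest of the function falls in the elided span) is a plain 32-bit fill. A scalar sketch of what it stores, assuming eax holds the 32-bit value and ecx the count:

    #include <stdint.h>

    /* Scalar model of the rep stosd loop: store one 32-bit value
     * count times. */
    static void SetRow32_C_sketch(uint32_t* dst, uint32_t v32, int count) {
      for (int i = 0; i < count; ++i) {
        dst[i] = v32;
      }
    }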
4024 4024
4025 #ifdef HAS_YUY2TOYROW_AVX2 4025 #ifdef HAS_YUY2TOYROW_AVX2
4026 __declspec(naked) 4026 __declspec(naked)
4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { 4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
4028 __asm { 4028 __asm {
4029 mov eax, [esp + 4] // src_yuy2 4029 mov eax, [esp + 4] // src_yuy2
4030 mov edx, [esp + 8] // dst_y 4030 mov edx, [esp + 8] // dst_y
4031 mov ecx, [esp + 12] // pix 4031 mov ecx, [esp + 12] // width
4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4033 vpsrlw ymm5, ymm5, 8 4033 vpsrlw ymm5, ymm5, 8
4034 4034
4035 convertloop: 4035 convertloop:
4036 vmovdqu ymm0, [eax] 4036 vmovdqu ymm0, [eax]
4037 vmovdqu ymm1, [eax + 32] 4037 vmovdqu ymm1, [eax + 32]
4038 lea eax, [eax + 64] 4038 lea eax, [eax + 64]
4039 vpand ymm0, ymm0, ymm5 // even bytes are Y 4039 vpand ymm0, ymm0, ymm5 // even bytes are Y
4040 vpand ymm1, ymm1, ymm5 4040 vpand ymm1, ymm1, ymm5
4041 vpackuswb ymm0, ymm0, ymm1 // mutates. 4041 vpackuswb ymm0, ymm0, ymm1 // mutates.
4042 vpermq ymm0, ymm0, 0xd8 4042 vpermq ymm0, ymm0, 0xd8
4043 vmovdqu [edx], ymm0 4043 vmovdqu [edx], ymm0
4044 lea edx, [edx + 32] 4044 lea edx, [edx + 32]
4045 sub ecx, 32 4045 sub ecx, 32
4046 jg convertloop 4046 jg convertloop
4047 vzeroupper 4047 vzeroupper
4048 ret 4048 ret
4049 } 4049 }
4050 } 4050 }
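In the YUY2 (YUYV) layout, luma occupies the even bytes, which is why the loop masks with 0x00ff00ff and packs. Scalar sketch (illustrative helper):

    #include <stdint.h>

    /* Scalar model of YUY2ToYRow: Y lives in the even bytes of YUYV. */
    static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2,
                                    uint8_t* dst_y, int width) {
      for (int x = 0; x < width; ++x) {
        dst_y[x] = src_yuy2[2 * x];
      }
    }

The vpermq with immediate 0xd8 exists only because vpackuswb packs within 128-bit lanes, so the output byte order has to be repaired afterwards.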
4051 4051
4052 __declspec(naked) 4052 __declspec(naked)
4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
4054 uint8* dst_u, uint8* dst_v, int pix) { 4054 uint8* dst_u, uint8* dst_v, int width) {
4055 __asm { 4055 __asm {
4056 push esi 4056 push esi
4057 push edi 4057 push edi
4058 mov eax, [esp + 8 + 4] // src_yuy2 4058 mov eax, [esp + 8 + 4] // src_yuy2
4059 mov esi, [esp + 8 + 8] // stride_yuy2 4059 mov esi, [esp + 8 + 8] // stride_yuy2
4060 mov edx, [esp + 8 + 12] // dst_u 4060 mov edx, [esp + 8 + 12] // dst_u
4061 mov edi, [esp + 8 + 16] // dst_v 4061 mov edi, [esp + 8 + 16] // dst_v
4062 mov ecx, [esp + 8 + 20] // pix 4062 mov ecx, [esp + 8 + 20] // width
4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4064 vpsrlw ymm5, ymm5, 8 4064 vpsrlw ymm5, ymm5, 8
4065 sub edi, edx 4065 sub edi, edx
4066 4066
4067 convertloop: 4067 convertloop:
4068 vmovdqu ymm0, [eax] 4068 vmovdqu ymm0, [eax]
4069 vmovdqu ymm1, [eax + 32] 4069 vmovdqu ymm1, [eax + 32]
4070 vpavgb ymm0, ymm0, [eax + esi] 4070 vpavgb ymm0, ymm0, [eax + esi]
4071 vpavgb ymm1, ymm1, [eax + esi + 32] 4071 vpavgb ymm1, ymm1, [eax + esi + 32]
4072 lea eax, [eax + 64] 4072 lea eax, [eax + 64]
(...skipping 15 matching lines...)
4088 4088
4089 pop edi 4089 pop edi
4090 pop esi 4090 pop esi
4091 vzeroupper 4091 vzeroupper
4092 ret 4092 ret
4093 } 4093 }
4094 } 4094 }
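YUY2ToUVRow additionally averages chroma vertically across two rows, which is what the vpavgb against [eax + esi] does. A scalar sketch (pavgb rounds as (a + b + 1) >> 1):

    #include <stdint.h>

    /* Scalar model of YUY2ToUVRow: average U and V of two YUYV rows.
     * Bytes per pixel pair: Y0 U Y1 V; stride is in bytes. */
    static void YUY2ToUVRow_C_sketch(const uint8_t* src_yuy2, int stride,
                                     uint8_t* dst_u, uint8_t* dst_v,
                                     int width) {
      for (int x = 0; x < width; x += 2) {
        const uint8_t* p = src_yuy2 + 2 * x;
        dst_u[x / 2] = (uint8_t)((p[1] + p[stride + 1] + 1) >> 1);
        dst_v[x / 2] = (uint8_t)((p[3] + p[stride + 3] + 1) >> 1);
      }
    }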
4095 4095
4096 __declspec(naked) 4096 __declspec(naked)
4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
4098 uint8* dst_u, uint8* dst_v, int pix) { 4098 uint8* dst_u, uint8* dst_v, int width) {
4099 __asm { 4099 __asm {
4100 push edi 4100 push edi
4101 mov eax, [esp + 4 + 4] // src_yuy2 4101 mov eax, [esp + 4 + 4] // src_yuy2
4102 mov edx, [esp + 4 + 8] // dst_u 4102 mov edx, [esp + 4 + 8] // dst_u
4103 mov edi, [esp + 4 + 12] // dst_v 4103 mov edi, [esp + 4 + 12] // dst_v
4104 mov ecx, [esp + 4 + 16] // pix 4104 mov ecx, [esp + 4 + 16] // width
4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4106 vpsrlw ymm5, ymm5, 8 4106 vpsrlw ymm5, ymm5, 8
4107 sub edi, edx 4107 sub edi, edx
4108 4108
4109 convertloop: 4109 convertloop:
4110 vmovdqu ymm0, [eax] 4110 vmovdqu ymm0, [eax]
4111 vmovdqu ymm1, [eax + 32] 4111 vmovdqu ymm1, [eax + 32]
4112 lea eax, [eax + 64] 4112 lea eax, [eax + 64]
4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
4114 vpsrlw ymm1, ymm1, 8 4114 vpsrlw ymm1, ymm1, 8
(...skipping 12 matching lines...)
4127 jg convertloop 4127 jg convertloop
4128 4128
4129 pop edi 4129 pop edi
4130 vzeroupper 4130 vzeroupper
4131 ret 4131 ret
4132 } 4132 }
4133 } 4133 }
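The UV422 variant is the same chroma extraction without the vertical average: one row in, one U/V sample per pixel pair out. Scalar sketch:

    #include <stdint.h>

    /* Scalar model of YUY2ToUV422Row: take chroma from a single row. */
    static void YUY2ToUV422Row_C_sketch(const uint8_t* src_yuy2,
                                        uint8_t* dst_u, uint8_t* dst_v,
                                        int width) {
      for (int x = 0; x < width; x += 2) {
        dst_u[x / 2] = src_yuy2[2 * x + 1];
        dst_v[x / 2] = src_yuy2[2 * x + 3];
      }
    }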
4134 4134
4135 __declspec(naked) 4135 __declspec(naked)
4136 void UYVYToYRow_AVX2(const uint8* src_uyvy, 4136 void UYVYToYRow_AVX2(const uint8* src_uyvy,
4137 uint8* dst_y, int pix) { 4137 uint8* dst_y, int width) {
4138 __asm { 4138 __asm {
4139 mov eax, [esp + 4] // src_uyvy 4139 mov eax, [esp + 4] // src_uyvy
4140 mov edx, [esp + 8] // dst_y 4140 mov edx, [esp + 8] // dst_y
4141 mov ecx, [esp + 12] // pix 4141 mov ecx, [esp + 12] // width
4142 4142
4143 convertloop: 4143 convertloop:
4144 vmovdqu ymm0, [eax] 4144 vmovdqu ymm0, [eax]
4145 vmovdqu ymm1, [eax + 32] 4145 vmovdqu ymm1, [eax + 32]
4146 lea eax, [eax + 64] 4146 lea eax, [eax + 64]
4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y 4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
4148 vpsrlw ymm1, ymm1, 8 4148 vpsrlw ymm1, ymm1, 8
4149 vpackuswb ymm0, ymm0, ymm1 // mutates. 4149 vpackuswb ymm0, ymm0, ymm1 // mutates.
4150 vpermq ymm0, ymm0, 0xd8 4150 vpermq ymm0, ymm0, 0xd8
4151 vmovdqu [edx], ymm0 4151 vmovdqu [edx], ymm0
4152 lea edx, [edx + 32] 4152 lea edx, [edx + 32]
4153 sub ecx, 32 4153 sub ecx, 32
4154 jg convertloop 4154 jg convertloop
4155 vzeroupper 4155 vzeroupper
4156 ret 4156 ret
4157 } 4157 }
4158 } 4158 }
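UYVY is the byte-swapped sibling of YUY2 (U Y0 V Y1 per pixel pair), so luma sits in the odd bytes; that is why this function shifts with vpsrlw where the YUY2 version masked with vpand. Scalar sketch:

    #include <stdint.h>

    /* Scalar model of UYVYToYRow: Y lives in the odd bytes of UYVY. */
    static void UYVYToYRow_C_sketch(const uint8_t* src_uyvy,
                                    uint8_t* dst_y, int width) {
      for (int x = 0; x < width; ++x) {
        dst_y[x] = src_uyvy[2 * x + 1];
      }
    }

The UYVY UV row functions that follow mirror the YUY2 ones with the same offset swap: chroma comes from the even bytes instead of the odd ones.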
4159 4159
4160 __declspec(naked) 4160 __declspec(naked)
4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
4162 uint8* dst_u, uint8* dst_v, int pix) { 4162 uint8* dst_u, uint8* dst_v, int width) {
4163 __asm { 4163 __asm {
4164 push esi 4164 push esi
4165 push edi 4165 push edi
4166 mov eax, [esp + 8 + 4] // src_uyvy 4166 mov eax, [esp + 8 + 4] // src_uyvy
4167 mov esi, [esp + 8 + 8] // stride_uyvy 4167 mov esi, [esp + 8 + 8] // stride_uyvy
4168 mov edx, [esp + 8 + 12] // dst_u 4168 mov edx, [esp + 8 + 12] // dst_u
4169 mov edi, [esp + 8 + 16] // dst_v 4169 mov edi, [esp + 8 + 16] // dst_v
4170 mov ecx, [esp + 8 + 20] // pix 4170 mov ecx, [esp + 8 + 20] // width
4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4172 vpsrlw ymm5, ymm5, 8 4172 vpsrlw ymm5, ymm5, 8
4173 sub edi, edx 4173 sub edi, edx
4174 4174
4175 convertloop: 4175 convertloop:
4176 vmovdqu ymm0, [eax] 4176 vmovdqu ymm0, [eax]
4177 vmovdqu ymm1, [eax + 32] 4177 vmovdqu ymm1, [eax + 32]
4178 vpavgb ymm0, ymm0, [eax + esi] 4178 vpavgb ymm0, ymm0, [eax + esi]
4179 vpavgb ymm1, ymm1, [eax + esi + 32] 4179 vpavgb ymm1, ymm1, [eax + esi + 32]
4180 lea eax, [eax + 64] 4180 lea eax, [eax + 64]
(...skipping 15 matching lines...)
4196 4196
4197 pop edi 4197 pop edi
4198 pop esi 4198 pop esi
4199 vzeroupper 4199 vzeroupper
4200 ret 4200 ret
4201 } 4201 }
4202 } 4202 }
4203 4203
4204 __declspec(naked) 4204 __declspec(naked)
4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
4206 uint8* dst_u, uint8* dst_v, int pix) { 4206 uint8* dst_u, uint8* dst_v, int width) {
4207 __asm { 4207 __asm {
4208 push edi 4208 push edi
4209 mov eax, [esp + 4 + 4] // src_uyvy 4209 mov eax, [esp + 4 + 4] // src_uyvy
4210 mov edx, [esp + 4 + 8] // dst_u 4210 mov edx, [esp + 4 + 8] // dst_u
4211 mov edi, [esp + 4 + 12] // dst_v 4211 mov edi, [esp + 4 + 12] // dst_v
4212 mov ecx, [esp + 4 + 16] // pix 4212 mov ecx, [esp + 4 + 16] // width
4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4214 vpsrlw ymm5, ymm5, 8 4214 vpsrlw ymm5, ymm5, 8
4215 sub edi, edx 4215 sub edi, edx
4216 4216
4217 convertloop: 4217 convertloop:
4218 vmovdqu ymm0, [eax] 4218 vmovdqu ymm0, [eax]
4219 vmovdqu ymm1, [eax + 32] 4219 vmovdqu ymm1, [eax + 32]
4220 lea eax, [eax + 64] 4220 lea eax, [eax + 64]
4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
4222 vpand ymm1, ymm1, ymm5 4222 vpand ymm1, ymm1, ymm5
(...skipping 14 matching lines...)
4237 pop edi 4237 pop edi
4238 vzeroupper 4238 vzeroupper
4239 ret 4239 ret
4240 } 4240 }
4241 } 4241 }
4242 #endif // HAS_YUY2TOYROW_AVX2 4242 #endif // HAS_YUY2TOYROW_AVX2
4243 4243
4244 #ifdef HAS_YUY2TOYROW_SSE2 4244 #ifdef HAS_YUY2TOYROW_SSE2
4245 __declspec(naked) 4245 __declspec(naked)
4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2, 4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
4247 uint8* dst_y, int pix) { 4247 uint8* dst_y, int width) {
4248 __asm { 4248 __asm {
4249 mov eax, [esp + 4] // src_yuy2 4249 mov eax, [esp + 4] // src_yuy2
4250 mov edx, [esp + 8] // dst_y 4250 mov edx, [esp + 8] // dst_y
4251 mov ecx, [esp + 12] // pix 4251 mov ecx, [esp + 12] // width
4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4253 psrlw xmm5, 8 4253 psrlw xmm5, 8
4254 4254
4255 convertloop: 4255 convertloop:
4256 movdqu xmm0, [eax] 4256 movdqu xmm0, [eax]
4257 movdqu xmm1, [eax + 16] 4257 movdqu xmm1, [eax + 16]
4258 lea eax, [eax + 32] 4258 lea eax, [eax + 32]
4259 pand xmm0, xmm5 // even bytes are Y 4259 pand xmm0, xmm5 // even bytes are Y
4260 pand xmm1, xmm5 4260 pand xmm1, xmm5
4261 packuswb xmm0, xmm1 4261 packuswb xmm0, xmm1
4262 movdqu [edx], xmm0 4262 movdqu [edx], xmm0
4263 lea edx, [edx + 16] 4263 lea edx, [edx + 16]
4264 sub ecx, 16 4264 sub ecx, 16
4265 jg convertloop 4265 jg convertloop
4266 ret 4266 ret
4267 } 4267 }
4268 } 4268 }
4269 4269
4270 __declspec(naked) 4270 __declspec(naked)
4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
4272 uint8* dst_u, uint8* dst_v, int pix) { 4272 uint8* dst_u, uint8* dst_v, int width) {
4273 __asm { 4273 __asm {
4274 push esi 4274 push esi
4275 push edi 4275 push edi
4276 mov eax, [esp + 8 + 4] // src_yuy2 4276 mov eax, [esp + 8 + 4] // src_yuy2
4277 mov esi, [esp + 8 + 8] // stride_yuy2 4277 mov esi, [esp + 8 + 8] // stride_yuy2
4278 mov edx, [esp + 8 + 12] // dst_u 4278 mov edx, [esp + 8 + 12] // dst_u
4279 mov edi, [esp + 8 + 16] // dst_v 4279 mov edi, [esp + 8 + 16] // dst_v
4280 mov ecx, [esp + 8 + 20] // pix 4280 mov ecx, [esp + 8 + 20] // width
4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4282 psrlw xmm5, 8 4282 psrlw xmm5, 8
4283 sub edi, edx 4283 sub edi, edx
4284 4284
4285 convertloop: 4285 convertloop:
4286 movdqu xmm0, [eax] 4286 movdqu xmm0, [eax]
4287 movdqu xmm1, [eax + 16] 4287 movdqu xmm1, [eax + 16]
4288 movdqu xmm2, [eax + esi] 4288 movdqu xmm2, [eax + esi]
4289 movdqu xmm3, [eax + esi + 16] 4289 movdqu xmm3, [eax + esi + 16]
4290 lea eax, [eax + 32] 4290 lea eax, [eax + 32]
(...skipping 14 matching lines...)
4305 jg convertloop 4305 jg convertloop
4306 4306
4307 pop edi 4307 pop edi
4308 pop esi 4308 pop esi
4309 ret 4309 ret
4310 } 4310 }
4311 } 4311 }
4312 4312
4313 __declspec(naked) 4313 __declspec(naked)
4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
4315 uint8* dst_u, uint8* dst_v, int pix) { 4315 uint8* dst_u, uint8* dst_v, int width) {
4316 __asm { 4316 __asm {
4317 push edi 4317 push edi
4318 mov eax, [esp + 4 + 4] // src_yuy2 4318 mov eax, [esp + 4 + 4] // src_yuy2
4319 mov edx, [esp + 4 + 8] // dst_u 4319 mov edx, [esp + 4 + 8] // dst_u
4320 mov edi, [esp + 4 + 12] // dst_v 4320 mov edi, [esp + 4 + 12] // dst_v
4321 mov ecx, [esp + 4 + 16] // pix 4321 mov ecx, [esp + 4 + 16] // width
4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4323 psrlw xmm5, 8 4323 psrlw xmm5, 8
4324 sub edi, edx 4324 sub edi, edx
4325 4325
4326 convertloop: 4326 convertloop:
4327 movdqu xmm0, [eax] 4327 movdqu xmm0, [eax]
4328 movdqu xmm1, [eax + 16] 4328 movdqu xmm1, [eax + 16]
4329 lea eax, [eax + 32] 4329 lea eax, [eax + 32]
4330 psrlw xmm0, 8 // YUYV -> UVUV 4330 psrlw xmm0, 8 // YUYV -> UVUV
4331 psrlw xmm1, 8 4331 psrlw xmm1, 8
4332 packuswb xmm0, xmm1 4332 packuswb xmm0, xmm1
4333 movdqa xmm1, xmm0 4333 movdqa xmm1, xmm0
4334 pand xmm0, xmm5 // U 4334 pand xmm0, xmm5 // U
4335 packuswb xmm0, xmm0 4335 packuswb xmm0, xmm0
4336 psrlw xmm1, 8 // V 4336 psrlw xmm1, 8 // V
4337 packuswb xmm1, xmm1 4337 packuswb xmm1, xmm1
4338 movq qword ptr [edx], xmm0 4338 movq qword ptr [edx], xmm0
4339 movq qword ptr [edx + edi], xmm1 4339 movq qword ptr [edx + edi], xmm1
4340 lea edx, [edx + 8] 4340 lea edx, [edx + 8]
4341 sub ecx, 16 4341 sub ecx, 16
4342 jg convertloop 4342 jg convertloop
4343 4343
4344 pop edi 4344 pop edi
4345 ret 4345 ret
4346 } 4346 }
4347 } 4347 }
4348 4348
4349 __declspec(naked) 4349 __declspec(naked)
4350 void UYVYToYRow_SSE2(const uint8* src_uyvy, 4350 void UYVYToYRow_SSE2(const uint8* src_uyvy,
4351 uint8* dst_y, int pix) { 4351 uint8* dst_y, int width) {
4352 __asm { 4352 __asm {
4353 mov eax, [esp + 4] // src_uyvy 4353 mov eax, [esp + 4] // src_uyvy
4354 mov edx, [esp + 8] // dst_y 4354 mov edx, [esp + 8] // dst_y
4355 mov ecx, [esp + 12] // pix 4355 mov ecx, [esp + 12] // width
4356 4356
4357 convertloop: 4357 convertloop:
4358 movdqu xmm0, [eax] 4358 movdqu xmm0, [eax]
4359 movdqu xmm1, [eax + 16] 4359 movdqu xmm1, [eax + 16]
4360 lea eax, [eax + 32] 4360 lea eax, [eax + 32]
4361 psrlw xmm0, 8 // odd bytes are Y 4361 psrlw xmm0, 8 // odd bytes are Y
4362 psrlw xmm1, 8 4362 psrlw xmm1, 8
4363 packuswb xmm0, xmm1 4363 packuswb xmm0, xmm1
4364 movdqu [edx], xmm0 4364 movdqu [edx], xmm0
4365 lea edx, [edx + 16] 4365 lea edx, [edx + 16]
4366 sub ecx, 16 4366 sub ecx, 16
4367 jg convertloop 4367 jg convertloop
4368 ret 4368 ret
4369 } 4369 }
4370 } 4370 }
4371 4371
4372 __declspec(naked) 4372 __declspec(naked)
4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4374 uint8* dst_u, uint8* dst_v, int pix) { 4374 uint8* dst_u, uint8* dst_v, int width) {
4375 __asm { 4375 __asm {
4376 push esi 4376 push esi
4377 push edi 4377 push edi
4378 mov eax, [esp + 8 + 4] // src_uyvy 4378 mov eax, [esp + 8 + 4] // src_uyvy
4379 mov esi, [esp + 8 + 8] // stride_uyvy 4379 mov esi, [esp + 8 + 8] // stride_uyvy
4380 mov edx, [esp + 8 + 12] // dst_u 4380 mov edx, [esp + 8 + 12] // dst_u
4381 mov edi, [esp + 8 + 16] // dst_v 4381 mov edi, [esp + 8 + 16] // dst_v
4382 mov ecx, [esp + 8 + 20] // pix 4382 mov ecx, [esp + 8 + 20] // width
4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4384 psrlw xmm5, 8 4384 psrlw xmm5, 8
4385 sub edi, edx 4385 sub edi, edx
4386 4386
4387 convertloop: 4387 convertloop:
4388 movdqu xmm0, [eax] 4388 movdqu xmm0, [eax]
4389 movdqu xmm1, [eax + 16] 4389 movdqu xmm1, [eax + 16]
4390 movdqu xmm2, [eax + esi] 4390 movdqu xmm2, [eax + esi]
4391 movdqu xmm3, [eax + esi + 16] 4391 movdqu xmm3, [eax + esi + 16]
4392 lea eax, [eax + 32] 4392 lea eax, [eax + 32]
(...skipping 14 matching lines...)
4407 jg convertloop 4407 jg convertloop
4408 4408
4409 pop edi 4409 pop edi
4410 pop esi 4410 pop esi
4411 ret 4411 ret
4412 } 4412 }
4413 } 4413 }
4414 4414
4415 __declspec(naked) 4415 __declspec(naked)
4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4417 uint8* dst_u, uint8* dst_v, int pix) { 4417 uint8* dst_u, uint8* dst_v, int width) {
4418 __asm { 4418 __asm {
4419 push edi 4419 push edi
4420 mov eax, [esp + 4 + 4] // src_uyvy 4420 mov eax, [esp + 4 + 4] // src_uyvy
4421 mov edx, [esp + 4 + 8] // dst_u 4421 mov edx, [esp + 4 + 8] // dst_u
4422 mov edi, [esp + 4 + 12] // dst_v 4422 mov edi, [esp + 4 + 12] // dst_v
4423 mov ecx, [esp + 4 + 16] // pix 4423 mov ecx, [esp + 4 + 16] // width
4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4425 psrlw xmm5, 8 4425 psrlw xmm5, 8
4426 sub edi, edx 4426 sub edi, edx
4427 4427
4428 convertloop: 4428 convertloop:
4429 movdqu xmm0, [eax] 4429 movdqu xmm0, [eax]
4430 movdqu xmm1, [eax + 16] 4430 movdqu xmm1, [eax + 16]
4431 lea eax, [eax + 32] 4431 lea eax, [eax + 32]
4432 pand xmm0, xmm5 // UYVY -> UVUV 4432 pand xmm0, xmm5 // UYVY -> UVUV
4433 pand xmm1, xmm5 4433 pand xmm1, xmm5
(...skipping 1685 matching lines...)
6119 pop edi 6119 pop edi
6120 pop esi 6120 pop esi
6121 ret 6121 ret
6122 } 6122 }
6123 } 6123 }
6124 #endif // HAS_INTERPOLATEROW_SSE2 6124 #endif // HAS_INTERPOLATEROW_SSE2
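Only the epilogue of InterpolateRow_SSE2 survives the elided span above. For context, the row interpolator blends a row with the row one stride below it using an 8-bit fraction; a scalar sketch, with parameter names assumed from the libyuv C-fallback convention:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of InterpolateRow: dst = lerp(src, src + stride)
     * with an 8-bit weight. fraction == 0 copies the top row. */
    static void InterpolateRow_C_sketch(uint8_t* dst, const uint8_t* src,
                                        ptrdiff_t stride, int width,
                                        int fraction) {
      int y1 = fraction;        /* weight of the lower row */
      int y0 = 256 - fraction;  /* weight of the upper row */
      for (int x = 0; x < width; ++x) {
        dst[x] = (uint8_t)((src[x] * y0 + src[x + stride] * y1) >> 8);
      }
    }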
6125 6125
6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
6127 __declspec(naked) 6127 __declspec(naked)
6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6129 const uint8* shuffler, int pix) { 6129 const uint8* shuffler, int width) {
6130 __asm { 6130 __asm {
6131 mov eax, [esp + 4] // src_argb 6131 mov eax, [esp + 4] // src_argb
6132 mov edx, [esp + 8] // dst_argb 6132 mov edx, [esp + 8] // dst_argb
6133 mov ecx, [esp + 12] // shuffler 6133 mov ecx, [esp + 12] // shuffler
6134 movdqu xmm5, [ecx] 6134 movdqu xmm5, [ecx]
6135 mov ecx, [esp + 16] // pix 6135 mov ecx, [esp + 16] // width
6136 6136
6137 wloop: 6137 wloop:
6138 movdqu xmm0, [eax] 6138 movdqu xmm0, [eax]
6139 movdqu xmm1, [eax + 16] 6139 movdqu xmm1, [eax + 16]
6140 lea eax, [eax + 32] 6140 lea eax, [eax + 32]
6141 pshufb xmm0, xmm5 6141 pshufb xmm0, xmm5
6142 pshufb xmm1, xmm5 6142 pshufb xmm1, xmm5
6143 movdqu [edx], xmm0 6143 movdqu [edx], xmm0
6144 movdqu [edx + 16], xmm1 6144 movdqu [edx + 16], xmm1
6145 lea edx, [edx + 32] 6145 lea edx, [edx + 32]
6146 sub ecx, 8 6146 sub ecx, 8
6147 jg wloop 6147 jg wloop
6148 ret 6148 ret
6149 } 6149 }
6150 } 6150 }
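The 16-byte shuffler covers four ARGB pixels, and pshufb applies it independently to each 16-byte group, so the scalar equivalent is a byte permutation per group. A sketch, assuming shuffler indices stay in 0..15 (true for the masks used here):

    #include <stdint.h>

    /* Scalar model of ARGBShuffleRow: permute the bytes of each
     * 16-byte (4-pixel) group by the shuffler. */
    static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        const uint8_t* shuffler, int width) {
      for (int x = 0; x < width; x += 4) {
        for (int j = 0; j < 16; ++j) {
          dst_argb[4 * x + j] = src_argb[4 * x + (shuffler[j] & 15)];
        }
      }
    }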
6151 6151
6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2 6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2
6153 __declspec(naked) 6153 __declspec(naked)
6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
6155 const uint8* shuffler, int pix) { 6155 const uint8* shuffler, int width) {
6156 __asm { 6156 __asm {
6157 mov eax, [esp + 4] // src_argb 6157 mov eax, [esp + 4] // src_argb
6158 mov edx, [esp + 8] // dst_argb 6158 mov edx, [esp + 8] // dst_argb
6159 mov ecx, [esp + 12] // shuffler 6159 mov ecx, [esp + 12] // shuffler
6160 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 6160 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
6161 mov ecx, [esp + 16] // pix 6161 mov ecx, [esp + 16] // width
6162 6162
6163 wloop: 6163 wloop:
6164 vmovdqu ymm0, [eax] 6164 vmovdqu ymm0, [eax]
6165 vmovdqu ymm1, [eax + 32] 6165 vmovdqu ymm1, [eax + 32]
6166 lea eax, [eax + 64] 6166 lea eax, [eax + 64]
6167 vpshufb ymm0, ymm0, ymm5 6167 vpshufb ymm0, ymm0, ymm5
6168 vpshufb ymm1, ymm1, ymm5 6168 vpshufb ymm1, ymm1, ymm5
6169 vmovdqu [edx], ymm0 6169 vmovdqu [edx], ymm0
6170 vmovdqu [edx + 32], ymm1 6170 vmovdqu [edx + 32], ymm1
6171 lea edx, [edx + 64] 6171 lea edx, [edx + 64]
6172 sub ecx, 16 6172 sub ecx, 16
6173 jg wloop 6173 jg wloop
6174 6174
6175 vzeroupper 6175 vzeroupper
6176 ret 6176 ret
6177 } 6177 }
6178 } 6178 }
6179 #endif // HAS_ARGBSHUFFLEROW_AVX2 6179 #endif // HAS_ARGBSHUFFLEROW_AVX2
6180 6180
6181 __declspec(naked) 6181 __declspec(naked)
6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
6183 const uint8* shuffler, int pix) { 6183 const uint8* shuffler, int width) {
6184 __asm { 6184 __asm {
6185 push ebx 6185 push ebx
6186 push esi 6186 push esi
6187 mov eax, [esp + 8 + 4] // src_argb 6187 mov eax, [esp + 8 + 4] // src_argb
6188 mov edx, [esp + 8 + 8] // dst_argb 6188 mov edx, [esp + 8 + 8] // dst_argb
6189 mov esi, [esp + 8 + 12] // shuffler 6189 mov esi, [esp + 8 + 12] // shuffler
6190 mov ecx, [esp + 8 + 16] // pix 6190 mov ecx, [esp + 8 + 16] // width
6191 pxor xmm5, xmm5 6191 pxor xmm5, xmm5
6192 6192
6193 mov ebx, [esi] // shuffler 6193 mov ebx, [esi] // shuffler
6194 cmp ebx, 0x03000102 6194 cmp ebx, 0x03000102
6195 je shuf_3012 6195 je shuf_3012
6196 cmp ebx, 0x00010203 6196 cmp ebx, 0x00010203
6197 je shuf_0123 6197 je shuf_0123
6198 cmp ebx, 0x00030201 6198 cmp ebx, 0x00030201
6199 je shuf_0321 6199 je shuf_0321
6200 cmp ebx, 0x02010003 6200 cmp ebx, 0x02010003
(...skipping 435 matching lines...)
6636 } 6636 }
6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6638 6638
6639 #endif // defined(_M_X64) 6639 #endif // defined(_M_X64)
6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6641 6641
6642 #ifdef __cplusplus 6642 #ifdef __cplusplus
6643 } // extern "C" 6643 } // extern "C"
6644 } // namespace libyuv 6644 } // namespace libyuv
6645 #endif 6645 #endif