OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 309 matching lines...)
320 }; | 320 }; |
321 | 321 |
322 // NV21 shuf 8 VU to 16 UV. | 322 // NV21 shuf 8 VU to 16 UV. |
323 static const lvec8 kShuffleNV21 = { | 323 static const lvec8 kShuffleNV21 = { |
324 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 324 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
325 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 325 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
326 }; | 326 }; |
327 | 327 |
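Applied with pshufb/vpshufb, kShuffleNV21 turns 8 bytes of interleaved VU
into 16 bytes of UV, swapping each pair and writing it twice (both halves
of the lvec8 repeat so each 128-bit lane shuffles identically). A scalar
sketch of the same mapping:

  // src: V0 U0 V1 U1 ...   dst: U0 V0 U0 V0 U1 V1 U1 V1 ...
  for (int i = 0; i < 4; ++i) {
    dst[4 * i + 0] = src[2 * i + 1];  // U
    dst[4 * i + 1] = src[2 * i + 0];  // V
    dst[4 * i + 2] = src[2 * i + 1];  // U
    dst[4 * i + 3] = src[2 * i + 0];  // V
  }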
328 // Duplicates gray value 3 times and fills in alpha opaque. | 328 // Duplicates gray value 3 times and fills in alpha opaque. |
329 __declspec(naked) | 329 __declspec(naked) |
330 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 330 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { |
331 __asm { | 331 __asm { |
332 mov eax, [esp + 4] // src_y | 332 mov eax, [esp + 4] // src_y |
333 mov edx, [esp + 8] // dst_argb | 333 mov edx, [esp + 8] // dst_argb |
334 mov ecx, [esp + 12] // pix | 334 mov ecx, [esp + 12] // width |
335 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 335 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
336 pslld xmm5, 24 | 336 pslld xmm5, 24 |
337 | 337 |
338 convertloop: | 338 convertloop: |
339 movq xmm0, qword ptr [eax] | 339 movq xmm0, qword ptr [eax] |
340 lea eax, [eax + 8] | 340 lea eax, [eax + 8] |
341 punpcklbw xmm0, xmm0 | 341 punpcklbw xmm0, xmm0 |
342 movdqa xmm1, xmm0 | 342 movdqa xmm1, xmm0 |
343 punpcklwd xmm0, xmm0 | 343 punpcklwd xmm0, xmm0 |
344 punpckhwd xmm1, xmm1 | 344 punpckhwd xmm1, xmm1 |
345 por xmm0, xmm5 | 345 por xmm0, xmm5 |
346 por xmm1, xmm5 | 346 por xmm1, xmm5 |
347 movdqu [edx], xmm0 | 347 movdqu [edx], xmm0 |
348 movdqu [edx + 16], xmm1 | 348 movdqu [edx + 16], xmm1 |
349 lea edx, [edx + 32] | 349 lea edx, [edx + 32] |
350 sub ecx, 8 | 350 sub ecx, 8 |
351 jg convertloop | 351 jg convertloop |
352 ret | 352 ret |
353 } | 353 } |
354 } | 354 } |
355 | 355 |
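For reference, the per-pixel effect of the loop above as a scalar sketch
(hypothetical helper, assuming libyuv's B,G,R,A byte order for ARGB):

  static void J400ToARGBPixel_sketch(uint8 y, uint8* dst_argb) {
    dst_argb[0] = y;    // B
    dst_argb[1] = y;    // G
    dst_argb[2] = y;    // R
    dst_argb[3] = 255;  // A, forced opaque by the 0xff000000 mask
  }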
356 #ifdef HAS_J400TOARGBROW_AVX2 | 356 #ifdef HAS_J400TOARGBROW_AVX2 |
357 // Duplicates gray value 3 times and fills in alpha opaque. | 357 // Duplicates gray value 3 times and fills in alpha opaque. |
358 __declspec(naked) | 358 __declspec(naked) |
359 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { | 359 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { |
360 __asm { | 360 __asm { |
361 mov eax, [esp + 4] // src_y | 361 mov eax, [esp + 4] // src_y |
362 mov edx, [esp + 8] // dst_argb | 362 mov edx, [esp + 8] // dst_argb |
363 mov ecx, [esp + 12] // pix | 363 mov ecx, [esp + 12] // width |
364 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 | 364 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
365 vpslld ymm5, ymm5, 24 | 365 vpslld ymm5, ymm5, 24 |
366 | 366 |
367 convertloop: | 367 convertloop: |
368 vmovdqu xmm0, [eax] | 368 vmovdqu xmm0, [eax] |
369 lea eax, [eax + 16] | 369 lea eax, [eax + 16] |
370 vpermq ymm0, ymm0, 0xd8 | 370 vpermq ymm0, ymm0, 0xd8 |
371 vpunpcklbw ymm0, ymm0, ymm0 | 371 vpunpcklbw ymm0, ymm0, ymm0 |
372 vpermq ymm0, ymm0, 0xd8 | 372 vpermq ymm0, ymm0, 0xd8 |
373 vpunpckhwd ymm1, ymm0, ymm0 | 373 vpunpckhwd ymm1, ymm0, ymm0 |
374 vpunpcklwd ymm0, ymm0, ymm0 | 374 vpunpcklwd ymm0, ymm0, ymm0 |
375 vpor ymm0, ymm0, ymm5 | 375 vpor ymm0, ymm0, ymm5 |
376 vpor ymm1, ymm1, ymm5 | 376 vpor ymm1, ymm1, ymm5 |
377 vmovdqu [edx], ymm0 | 377 vmovdqu [edx], ymm0 |
378 vmovdqu [edx + 32], ymm1 | 378 vmovdqu [edx + 32], ymm1 |
379 lea edx, [edx + 64] | 379 lea edx, [edx + 64] |
380 sub ecx, 16 | 380 sub ecx, 16 |
381 jg convertloop | 381 jg convertloop |
382 vzeroupper | 382 vzeroupper |
383 ret | 383 ret |
384 } | 384 } |
385 } | 385 } |
386 #endif // HAS_J400TOARGBROW_AVX2 | 386 #endif // HAS_J400TOARGBROW_AVX2 |
387 | 387 |
388 __declspec(naked) | 388 __declspec(naked) |
389 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 389 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { |
390 __asm { | 390 __asm { |
391 mov eax, [esp + 4] // src_rgb24 | 391 mov eax, [esp + 4] // src_rgb24 |
392 mov edx, [esp + 8] // dst_argb | 392 mov edx, [esp + 8] // dst_argb |
393 mov ecx, [esp + 12] // pix | 393 mov ecx, [esp + 12] // width |
394 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 394 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
395 pslld xmm5, 24 | 395 pslld xmm5, 24 |
396 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB | 396 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB |
397 | 397 |
398 convertloop: | 398 convertloop: |
399 movdqu xmm0, [eax] | 399 movdqu xmm0, [eax] |
400 movdqu xmm1, [eax + 16] | 400 movdqu xmm1, [eax + 16] |
401 movdqu xmm3, [eax + 32] | 401 movdqu xmm3, [eax + 32] |
402 lea eax, [eax + 48] | 402 lea eax, [eax + 48] |
403 movdqa xmm2, xmm3 | 403 movdqa xmm2, xmm3 |
(...skipping 14 matching lines...)
418 movdqu [edx + 48], xmm3 | 418 movdqu [edx + 48], xmm3 |
419 lea edx, [edx + 64] | 419 lea edx, [edx + 64] |
420 sub ecx, 16 | 420 sub ecx, 16 |
421 jg convertloop | 421 jg convertloop |
422 ret | 422 ret |
423 } | 423 } |
424 } | 424 } |
425 | 425 |
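The kShuffleMaskRGB24ToARGB pshufb expands each 3-byte pixel to 4 bytes;
per pixel this is equivalent to the following sketch (hypothetical helper,
assuming RGB24 is stored B,G,R as elsewhere in libyuv):

  static void RGB24ToARGBPixel_sketch(const uint8* src, uint8* dst) {
    dst[0] = src[0];  // B
    dst[1] = src[1];  // G
    dst[2] = src[2];  // R
    dst[3] = 255;     // A comes from the 0xff000000 mask
  }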
426 __declspec(naked) | 426 __declspec(naked) |
427 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 427 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
428 int pix) { | 428 int width) { |
429 __asm { | 429 __asm { |
430 mov eax, [esp + 4] // src_raw | 430 mov eax, [esp + 4] // src_raw |
431 mov edx, [esp + 8] // dst_argb | 431 mov edx, [esp + 8] // dst_argb |
432 mov ecx, [esp + 12] // pix | 432 mov ecx, [esp + 12] // width |
433 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 433 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
434 pslld xmm5, 24 | 434 pslld xmm5, 24 |
435 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB | 435 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB |
436 | 436 |
437 convertloop: | 437 convertloop: |
438 movdqu xmm0, [eax] | 438 movdqu xmm0, [eax] |
439 movdqu xmm1, [eax + 16] | 439 movdqu xmm1, [eax + 16] |
440 movdqu xmm3, [eax + 32] | 440 movdqu xmm3, [eax + 32] |
441 lea eax, [eax + 48] | 441 lea eax, [eax + 48] |
442 movdqa xmm2, xmm3 | 442 movdqa xmm2, xmm3 |
(...skipping 21 matching lines...)
464 | 464 |
465 // pmul method to replicate bits. | 465 // pmul method to replicate bits. |
466 // Math to replicate bits: | 466 // Math to replicate bits: |
467 // (v << 8) | (v << 3) | 467 // (v << 8) | (v << 3) |
468 // v * 256 + v * 8 | 468 // v * 256 + v * 8 |
469 // v * (256 + 8) | 469 // v * (256 + 8) |
470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
471 // 20 instructions. | 471 // 20 instructions. |
472 __declspec(naked) | 472 __declspec(naked) |
473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
474 int pix) { | 474 int width) { |
475 __asm { | 475 __asm { |
476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
477 movd xmm5, eax | 477 movd xmm5, eax |
478 pshufd xmm5, xmm5, 0 | 478 pshufd xmm5, xmm5, 0 |
479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
480 movd xmm6, eax | 480 movd xmm6, eax |
481 pshufd xmm6, xmm6, 0 | 481 pshufd xmm6, xmm6, 0 |
482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
483 psllw xmm3, 11 | 483 psllw xmm3, 11 |
484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green | 484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green |
485 psllw xmm4, 10 | 485 psllw xmm4, 10 |
486 psrlw xmm4, 5 | 486 psrlw xmm4, 5 |
487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha | 487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
488 psllw xmm7, 8 | 488 psllw xmm7, 8 |
489 | 489 |
490 mov eax, [esp + 4] // src_rgb565 | 490 mov eax, [esp + 4] // src_rgb565 |
491 mov edx, [esp + 8] // dst_argb | 491 mov edx, [esp + 8] // dst_argb |
492 mov ecx, [esp + 12] // pix | 492 mov ecx, [esp + 12] // width |
493 sub edx, eax | 493 sub edx, eax |
494 sub edx, eax | 494 sub edx, eax |
495 | 495 |
496 convertloop: | 496 convertloop: |
497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 | 497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 |
498 movdqa xmm1, xmm0 | 498 movdqa xmm1, xmm0 |
499 movdqa xmm2, xmm0 | 499 movdqa xmm2, xmm0 |
500 pand xmm1, xmm3 // R in upper 5 bits | 500 pand xmm1, xmm3 // R in upper 5 bits |
501 psllw xmm2, 11 // B in upper 5 bits | 501 psllw xmm2, 11 // B in upper 5 bits |
502 pmulhuw xmm1, xmm5 // * (256 + 8) | 502 pmulhuw xmm1, xmm5 // * (256 + 8) |
(...skipping 17 matching lines...)
520 | 520 |
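The multiply-high trick described in the comment can be checked in scalar
code: with the 5-bit field already shifted into bits 15..11 of a 16-bit
lane (as after the pand/psllw above), the high word of the product with
0x0108 is the field replicated to 8 bits. Hypothetical helper:

  static uint8 Expand5To8_sketch(uint8 v) {          // v in [0, 31]
    uint16 lane = (uint16)(v << 11);                 // field in bits 15..11
    return (uint8)(((uint32)lane * 0x0108u) >> 16);  // == (v << 3) | (v >> 2)
  }

Truncation is exact here: lane * 0x0108 >> 16 is floor(v * 8.25), which
equals 8 * v + v / 4 for all v < 32.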
521 #ifdef HAS_RGB565TOARGBROW_AVX2 | 521 #ifdef HAS_RGB565TOARGBROW_AVX2 |
522 // pmul method to replicate bits. | 522 // pmul method to replicate bits. |
523 // Math to replicate bits: | 523 // Math to replicate bits: |
524 // (v << 8) | (v << 3) | 524 // (v << 8) | (v << 3) |
525 // v * 256 + v * 8 | 525 // v * 256 + v * 8 |
526 // v * (256 + 8) | 526 // v * (256 + 8) |
527 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 527 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
528 __declspec(naked) | 528 __declspec(naked) |
529 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, | 529 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
530 int pix) { | 530 int width) { |
531 __asm { | 531 __asm { |
532 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 532 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
533 vmovd xmm5, eax | 533 vmovd xmm5, eax |
534 vbroadcastss ymm5, xmm5 | 534 vbroadcastss ymm5, xmm5 |
535 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 535 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
536 movd xmm6, eax | 536 movd xmm6, eax |
537 vbroadcastss ymm6, xmm6 | 537 vbroadcastss ymm6, xmm6 |
538 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 538 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
539 vpsllw ymm3, ymm3, 11 | 539 vpsllw ymm3, ymm3, 11 |
540 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green | 540 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
541 vpsllw ymm4, ymm4, 10 | 541 vpsllw ymm4, ymm4, 10 |
542 vpsrlw ymm4, ymm4, 5 | 542 vpsrlw ymm4, ymm4, 5 |
543 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 543 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
544 vpsllw ymm7, ymm7, 8 | 544 vpsllw ymm7, ymm7, 8 |
545 | 545 |
546 mov eax, [esp + 4] // src_rgb565 | 546 mov eax, [esp + 4] // src_rgb565 |
547 mov edx, [esp + 8] // dst_argb | 547 mov edx, [esp + 8] // dst_argb |
548 mov ecx, [esp + 12] // pix | 548 mov ecx, [esp + 12] // width |
549 sub edx, eax | 549 sub edx, eax |
550 sub edx, eax | 550 sub edx, eax |
551 | 551 |
552 convertloop: | 552 convertloop: |
553 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 | 553 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
554 vpand ymm1, ymm0, ymm3 // R in upper 5 bits | 554 vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
555 vpsllw ymm2, ymm0, 11 // B in upper 5 bits | 555 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
556 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) | 556 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
557 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) | 557 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
558 vpsllw ymm1, ymm1, 8 | 558 vpsllw ymm1, ymm1, 8 |
(...skipping 12 matching lines...)
571 jg convertloop | 571 jg convertloop |
572 vzeroupper | 572 vzeroupper |
573 ret | 573 ret |
574 } | 574 } |
575 } | 575 } |
576 #endif // HAS_RGB565TOARGBROW_AVX2 | 576 #endif // HAS_RGB565TOARGBROW_AVX2 |
577 | 577 |
578 #ifdef HAS_ARGB1555TOARGBROW_AVX2 | 578 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
579 __declspec(naked) | 579 __declspec(naked) |
580 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, | 580 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
581 int pix) { | 581 int width) { |
582 __asm { | 582 __asm { |
583 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 583 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
584 vmovd xmm5, eax | 584 vmovd xmm5, eax |
585 vbroadcastss ymm5, xmm5 | 585 vbroadcastss ymm5, xmm5 |
586 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 586 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
587 movd xmm6, eax | 587 movd xmm6, eax |
588 vbroadcastss ymm6, xmm6 | 588 vbroadcastss ymm6, xmm6 |
589 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 589 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
590 vpsllw ymm3, ymm3, 11 | 590 vpsllw ymm3, ymm3, 11 |
591 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green | 591 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
592 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 592 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
593 vpsllw ymm7, ymm7, 8 | 593 vpsllw ymm7, ymm7, 8 |
594 | 594 |
595 mov eax, [esp + 4] // src_argb1555 | 595 mov eax, [esp + 4] // src_argb1555 |
596 mov edx, [esp + 8] // dst_argb | 596 mov edx, [esp + 8] // dst_argb |
597 mov ecx, [esp + 12] // pix | 597 mov ecx, [esp + 12] // width |
598 sub edx, eax | 598 sub edx, eax |
599 sub edx, eax | 599 sub edx, eax |
600 | 600 |
601 convertloop: | 601 convertloop: |
602 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 | 602 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
603 vpsllw ymm1, ymm0, 1 // R in upper 5 bits | 603 vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
604 vpsllw ymm2, ymm0, 11 // B in upper 5 bits | 604 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
605 vpand ymm1, ymm1, ymm3 | 605 vpand ymm1, ymm1, ymm3 |
606 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) | 606 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
607 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) | 607 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
(...skipping 15 matching lines...)
623 jg convertloop | 623 jg convertloop |
624 vzeroupper | 624 vzeroupper |
625 ret | 625 ret |
626 } | 626 } |
627 } | 627 } |
628 #endif // HAS_ARGB1555TOARGBROW_AVX2 | 628 #endif // HAS_ARGB1555TOARGBROW_AVX2 |
629 | 629 |
630 #ifdef HAS_ARGB4444TOARGBROW_AVX2 | 630 #ifdef HAS_ARGB4444TOARGBROW_AVX2 |
631 __declspec(naked) | 631 __declspec(naked) |
632 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, | 632 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, |
633 int pix) { | 633 int width) { |
634 __asm { | 634 __asm { |
635 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 635 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
636 vmovd xmm4, eax | 636 vmovd xmm4, eax |
637 vbroadcastss ymm4, xmm4 | 637 vbroadcastss ymm4, xmm4 |
638 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles | 638 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles |
639 mov eax, [esp + 4] // src_argb4444 | 639 mov eax, [esp + 4] // src_argb4444 |
640 mov edx, [esp + 8] // dst_argb | 640 mov edx, [esp + 8] // dst_argb |
641 mov ecx, [esp + 12] // pix | 641 mov ecx, [esp + 12] // width |
642 sub edx, eax | 642 sub edx, eax |
643 sub edx, eax | 643 sub edx, eax |
644 | 644 |
645 convertloop: | 645 convertloop: |
646 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 | 646 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 |
647 vpand ymm2, ymm0, ymm5 // mask high nibbles | 647 vpand ymm2, ymm0, ymm5 // mask high nibbles |
648 vpand ymm0, ymm0, ymm4 // mask low nibbles | 648 vpand ymm0, ymm0, ymm4 // mask low nibbles |
649 vpsrlw ymm3, ymm2, 4 | 649 vpsrlw ymm3, ymm2, 4 |
650 vpsllw ymm1, ymm0, 4 | 650 vpsllw ymm1, ymm0, 4 |
651 vpor ymm2, ymm2, ymm3 | 651 vpor ymm2, ymm2, ymm3 |
652 vpor ymm0, ymm0, ymm1 | 652 vpor ymm0, ymm0, ymm1 |
653 vpermq ymm0, ymm0, 0xd8 // mutate for unpack | 653 vpermq ymm0, ymm0, 0xd8 // mutate for unpack |
654 vpermq ymm2, ymm2, 0xd8 | 654 vpermq ymm2, ymm2, 0xd8 |
655 vpunpckhbw ymm1, ymm0, ymm2 | 655 vpunpckhbw ymm1, ymm0, ymm2 |
656 vpunpcklbw ymm0, ymm0, ymm2 | 656 vpunpcklbw ymm0, ymm0, ymm2 |
657 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB | 657 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB |
658 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB | 658 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB |
659 lea eax, [eax + 32] | 659 lea eax, [eax + 32] |
660 sub ecx, 16 | 660 sub ecx, 16 |
661 jg convertloop | 661 jg convertloop |
662 vzeroupper | 662 vzeroupper |
663 ret | 663 ret |
664 } | 664 } |
665 } | 665 } |
666 #endif // HAS_ARGB4444TOARGBROW_AVX2 | 666 #endif // HAS_ARGB4444TOARGBROW_AVX2 |
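For the 4444 conversions, the mask/shift/or sequence replicates each 4-bit
channel into 8 bits; per nibble that is a multiply by 0x11 (hypothetical
helper):

  static uint8 Expand4To8_sketch(uint8 v) {  // v in [0, 15]
    return (uint8)(v * 0x11);                // == (v << 4) | v
  }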
667 | 667 |
668 // 24 instructions | 668 // 24 instructions |
669 __declspec(naked) | 669 __declspec(naked) |
670 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 670 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
671 int pix) { | 671 int width) { |
672 __asm { | 672 __asm { |
673 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 673 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
674 movd xmm5, eax | 674 movd xmm5, eax |
675 pshufd xmm5, xmm5, 0 | 675 pshufd xmm5, xmm5, 0 |
676 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 676 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
677 movd xmm6, eax | 677 movd xmm6, eax |
678 pshufd xmm6, xmm6, 0 | 678 pshufd xmm6, xmm6, 0 |
679 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 679 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
680 psllw xmm3, 11 | 680 psllw xmm3, 11 |
681 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green | 681 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green |
682 psrlw xmm4, 6 | 682 psrlw xmm4, 6 |
683 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha | 683 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
684 psllw xmm7, 8 | 684 psllw xmm7, 8 |
685 | 685 |
686 mov eax, [esp + 4] // src_argb1555 | 686 mov eax, [esp + 4] // src_argb1555 |
687 mov edx, [esp + 8] // dst_argb | 687 mov edx, [esp + 8] // dst_argb |
688 mov ecx, [esp + 12] // pix | 688 mov ecx, [esp + 12] // width |
689 sub edx, eax | 689 sub edx, eax |
690 sub edx, eax | 690 sub edx, eax |
691 | 691 |
692 convertloop: | 692 convertloop: |
693 movdqu xmm0, [eax] // fetch 8 pixels of 1555 | 693 movdqu xmm0, [eax] // fetch 8 pixels of 1555 |
694 movdqa xmm1, xmm0 | 694 movdqa xmm1, xmm0 |
695 movdqa xmm2, xmm0 | 695 movdqa xmm2, xmm0 |
696 psllw xmm1, 1 // R in upper 5 bits | 696 psllw xmm1, 1 // R in upper 5 bits |
697 psllw xmm2, 11 // B in upper 5 bits | 697 psllw xmm2, 11 // B in upper 5 bits |
698 pand xmm1, xmm3 | 698 pand xmm1, xmm3 |
(...skipping 15 matching lines...)
714 lea eax, [eax + 16] | 714 lea eax, [eax + 16] |
715 sub ecx, 8 | 715 sub ecx, 8 |
716 jg convertloop | 716 jg convertloop |
717 ret | 717 ret |
718 } | 718 } |
719 } | 719 } |
720 | 720 |
721 // 18 instructions. | 721 // 18 instructions. |
722 __declspec(naked) | 722 __declspec(naked) |
723 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 723 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
724 int pix) { | 724 int width) { |
725 __asm { | 725 __asm { |
726 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 726 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
727 movd xmm4, eax | 727 movd xmm4, eax |
728 pshufd xmm4, xmm4, 0 | 728 pshufd xmm4, xmm4, 0 |
729 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles | 729 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
730 pslld xmm5, 4 | 730 pslld xmm5, 4 |
731 mov eax, [esp + 4] // src_argb4444 | 731 mov eax, [esp + 4] // src_argb4444 |
732 mov edx, [esp + 8] // dst_argb | 732 mov edx, [esp + 8] // dst_argb |
733 mov ecx, [esp + 12] // pix | 733 mov ecx, [esp + 12] // width |
734 sub edx, eax | 734 sub edx, eax |
735 sub edx, eax | 735 sub edx, eax |
736 | 736 |
737 convertloop: | 737 convertloop: |
738 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 | 738 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 |
739 movdqa xmm2, xmm0 | 739 movdqa xmm2, xmm0 |
740 pand xmm0, xmm4 // mask low nibbles | 740 pand xmm0, xmm4 // mask low nibbles |
741 pand xmm2, xmm5 // mask high nibbles | 741 pand xmm2, xmm5 // mask high nibbles |
742 movdqa xmm1, xmm0 | 742 movdqa xmm1, xmm0 |
743 movdqa xmm3, xmm2 | 743 movdqa xmm3, xmm2 |
744 psllw xmm1, 4 | 744 psllw xmm1, 4 |
745 psrlw xmm3, 4 | 745 psrlw xmm3, 4 |
746 por xmm0, xmm1 | 746 por xmm0, xmm1 |
747 por xmm2, xmm3 | 747 por xmm2, xmm3 |
748 movdqa xmm1, xmm0 | 748 movdqa xmm1, xmm0 |
749 punpcklbw xmm0, xmm2 | 749 punpcklbw xmm0, xmm2 |
750 punpckhbw xmm1, xmm2 | 750 punpckhbw xmm1, xmm2 |
751 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB | 751 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
752 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB | 752 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
753 lea eax, [eax + 16] | 753 lea eax, [eax + 16] |
754 sub ecx, 8 | 754 sub ecx, 8 |
755 jg convertloop | 755 jg convertloop |
756 ret | 756 ret |
757 } | 757 } |
758 } | 758 } |
759 | 759 |
760 __declspec(naked) | 760 __declspec(naked) |
761 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 761 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { |
762 __asm { | 762 __asm { |
763 mov eax, [esp + 4] // src_argb | 763 mov eax, [esp + 4] // src_argb |
764 mov edx, [esp + 8] // dst_rgb | 764 mov edx, [esp + 8] // dst_rgb |
765 mov ecx, [esp + 12] // pix | 765 mov ecx, [esp + 12] // width |
766 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 766 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
767 | 767 |
768 convertloop: | 768 convertloop: |
769 movdqu xmm0, [eax] // fetch 16 pixels of argb | 769 movdqu xmm0, [eax] // fetch 16 pixels of argb |
770 movdqu xmm1, [eax + 16] | 770 movdqu xmm1, [eax + 16] |
771 movdqu xmm2, [eax + 32] | 771 movdqu xmm2, [eax + 32] |
772 movdqu xmm3, [eax + 48] | 772 movdqu xmm3, [eax + 48] |
773 lea eax, [eax + 64] | 773 lea eax, [eax + 64] |
774 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB | 774 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
775 pshufb xmm1, xmm6 | 775 pshufb xmm1, xmm6 |
(...skipping 13 matching lines...)
789 movdqu [edx + 16], xmm1 // store 1 | 789 movdqu [edx + 16], xmm1 // store 1 |
790 movdqu [edx + 32], xmm2 // store 2 | 790 movdqu [edx + 32], xmm2 // store 2 |
791 lea edx, [edx + 48] | 791 lea edx, [edx + 48] |
792 sub ecx, 16 | 792 sub ecx, 16 |
793 jg convertloop | 793 jg convertloop |
794 ret | 794 ret |
795 } | 795 } |
796 } | 796 } |
797 | 797 |
798 __declspec(naked) | 798 __declspec(naked) |
799 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 799 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { |
800 __asm { | 800 __asm { |
801 mov eax, [esp + 4] // src_argb | 801 mov eax, [esp + 4] // src_argb |
802 mov edx, [esp + 8] // dst_rgb | 802 mov edx, [esp + 8] // dst_rgb |
803 mov ecx, [esp + 12] // pix | 803 mov ecx, [esp + 12] // width |
804 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 804 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
805 | 805 |
806 convertloop: | 806 convertloop: |
807 movdqu xmm0, [eax] // fetch 16 pixels of argb | 807 movdqu xmm0, [eax] // fetch 16 pixels of argb |
808 movdqu xmm1, [eax + 16] | 808 movdqu xmm1, [eax + 16] |
809 movdqu xmm2, [eax + 32] | 809 movdqu xmm2, [eax + 32] |
810 movdqu xmm3, [eax + 48] | 810 movdqu xmm3, [eax + 48] |
811 lea eax, [eax + 64] | 811 lea eax, [eax + 64] |
812 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB | 812 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
813 pshufb xmm1, xmm6 | 813 pshufb xmm1, xmm6 |
(...skipping 13 matching lines...)
827 movdqu [edx + 16], xmm1 // store 1 | 827 movdqu [edx + 16], xmm1 // store 1 |
828 movdqu [edx + 32], xmm2 // store 2 | 828 movdqu [edx + 32], xmm2 // store 2 |
829 lea edx, [edx + 48] | 829 lea edx, [edx + 48] |
830 sub ecx, 16 | 830 sub ecx, 16 |
831 jg convertloop | 831 jg convertloop |
832 ret | 832 ret |
833 } | 833 } |
834 } | 834 } |
835 | 835 |
836 __declspec(naked) | 836 __declspec(naked) |
837 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 837 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
838 __asm { | 838 __asm { |
839 mov eax, [esp + 4] // src_argb | 839 mov eax, [esp + 4] // src_argb |
840 mov edx, [esp + 8] // dst_rgb | 840 mov edx, [esp + 8] // dst_rgb |
841 mov ecx, [esp + 12] // pix | 841 mov ecx, [esp + 12] // width |
842 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 842 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
843 psrld xmm3, 27 | 843 psrld xmm3, 27 |
844 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 844 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
845 psrld xmm4, 26 | 845 psrld xmm4, 26 |
846 pslld xmm4, 5 | 846 pslld xmm4, 5 |
847 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 | 847 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
848 pslld xmm5, 11 | 848 pslld xmm5, 11 |
849 | 849 |
850 convertloop: | 850 convertloop: |
851 movdqu xmm0, [eax] // fetch 4 pixels of argb | 851 movdqu xmm0, [eax] // fetch 4 pixels of argb |
(...skipping 13 matching lines...)
865 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 | 865 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
866 lea edx, [edx + 8] | 866 lea edx, [edx + 8] |
867 sub ecx, 4 | 867 sub ecx, 4 |
868 jg convertloop | 868 jg convertloop |
869 ret | 869 ret |
870 } | 870 } |
871 } | 871 } |
872 | 872 |
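The net per-pixel effect of the pipeline above is plain truncation to
5:6:5 (scalar sketch, hypothetical helper; alpha is dropped):

  static uint16 ARGBToRGB565Pixel_sketch(uint8 b, uint8 g, uint8 r) {
    return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
  }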
873 __declspec(naked) | 873 __declspec(naked) |
874 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, | 874 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, |
875 const uint32 dither4, int pix) { | 875 const uint32 dither4, int width) { |
876 __asm { | 876 __asm { |
877 | 877 |
878 mov eax, [esp + 4] // src_argb | 878 mov eax, [esp + 4] // src_argb |
879 mov edx, [esp + 8] // dst_rgb | 879 mov edx, [esp + 8] // dst_rgb |
880 movd xmm6, [esp + 12] // dither4 | 880 movd xmm6, [esp + 12] // dither4 |
881 mov ecx, [esp + 16] // pix | 881 mov ecx, [esp + 16] // width |
882 punpcklbw xmm6, xmm6 // make dither 16 bytes | 882 punpcklbw xmm6, xmm6 // make dither 16 bytes |
883 movdqa xmm7, xmm6 | 883 movdqa xmm7, xmm6 |
884 punpcklwd xmm6, xmm6 | 884 punpcklwd xmm6, xmm6 |
885 punpckhwd xmm7, xmm7 | 885 punpckhwd xmm7, xmm7 |
886 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 886 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
887 psrld xmm3, 27 | 887 psrld xmm3, 27 |
888 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 888 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
889 psrld xmm4, 26 | 889 psrld xmm4, 26 |
890 pslld xmm4, 5 | 890 pslld xmm4, 5 |
891 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 | 891 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
(...skipping 19 matching lines...)
911 lea edx, [edx + 8] | 911 lea edx, [edx + 8] |
912 sub ecx, 4 | 912 sub ecx, 4 |
913 jg convertloop | 913 jg convertloop |
914 ret | 914 ret |
915 } | 915 } |
916 } | 916 } |
917 | 917 |
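The punpcklbw/punpcklwd/punpckhwd sequence broadcasts each byte of dither4
across one pixel's four channels, so pixel column x gets one dither byte
added to B, G and R before truncation. A sketch of the intent (hypothetical
helpers; assumes the adds saturate in the elided body, as paddusb would):

  static int clamp255(int v) { return v > 255 ? 255 : v; }

  static uint16 DitherPixel_sketch(uint8 b, uint8 g, uint8 r,
                                   uint32 dither4, int x) {
    int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);
    return (uint16)((clamp255(b + d) >> 3) |
                    ((clamp255(g + d) >> 2) << 5) |
                    ((clamp255(r + d) >> 3) << 11));
  }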
918 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 918 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
919 __declspec(naked) | 919 __declspec(naked) |
920 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, | 920 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, |
921 const uint32 dither4, int pix) { | 921 const uint32 dither4, int width) { |
922 __asm { | 922 __asm { |
923 mov eax, [esp + 4] // src_argb | 923 mov eax, [esp + 4] // src_argb |
924 mov edx, [esp + 8] // dst_rgb | 924 mov edx, [esp + 8] // dst_rgb |
925 vbroadcastss xmm6, [esp + 12] // dither4 | 925 vbroadcastss xmm6, [esp + 12] // dither4 |
926 mov ecx, [esp + 16] // pix | 926 mov ecx, [esp + 16] // width |
927 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes | 927 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
928 vpermq ymm6, ymm6, 0xd8 | 928 vpermq ymm6, ymm6, 0xd8 |
929 vpunpcklwd ymm6, ymm6, ymm6 | 929 vpunpcklwd ymm6, ymm6, ymm6 |
930 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 930 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
931 vpsrld ymm3, ymm3, 27 | 931 vpsrld ymm3, ymm3, 27 |
932 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 932 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
933 vpsrld ymm4, ymm4, 26 | 933 vpsrld ymm4, ymm4, 26 |
934 vpslld ymm4, ymm4, 5 | 934 vpslld ymm4, ymm4, 5 |
935 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 | 935 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
936 | 936 |
(...skipping 16 matching lines...)
953 sub ecx, 8 | 953 sub ecx, 8 |
954 jg convertloop | 954 jg convertloop |
955 vzeroupper | 955 vzeroupper |
956 ret | 956 ret |
957 } | 957 } |
958 } | 958 } |
959 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 | 959 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
960 | 960 |
961 // TODO(fbarchard): Improve sign extension/packing. | 961 // TODO(fbarchard): Improve sign extension/packing. |
962 __declspec(naked) | 962 __declspec(naked) |
963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
964 __asm { | 964 __asm { |
965 mov eax, [esp + 4] // src_argb | 965 mov eax, [esp + 4] // src_argb |
966 mov edx, [esp + 8] // dst_rgb | 966 mov edx, [esp + 8] // dst_rgb |
967 mov ecx, [esp + 12] // pix | 967 mov ecx, [esp + 12] // width |
968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f | 968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
969 psrld xmm4, 27 | 969 psrld xmm4, 27 |
970 movdqa xmm5, xmm4 // generate mask 0x000003e0 | 970 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
971 pslld xmm5, 5 | 971 pslld xmm5, 5 |
972 movdqa xmm6, xmm4 // generate mask 0x00007c00 | 972 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
973 pslld xmm6, 10 | 973 pslld xmm6, 10 |
974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 | 974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 |
975 pslld xmm7, 15 | 975 pslld xmm7, 15 |
976 | 976 |
977 convertloop: | 977 convertloop: |
(...skipping 16 matching lines...)
994 lea eax, [eax + 16] | 994 lea eax, [eax + 16] |
995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 | 995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
996 lea edx, [edx + 8] | 996 lea edx, [edx + 8] |
997 sub ecx, 4 | 997 sub ecx, 4 |
998 jg convertloop | 998 jg convertloop |
999 ret | 999 ret |
1000 } | 1000 } |
1001 } | 1001 } |
1002 | 1002 |
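Scalar sketch of the 1555 packing above (hypothetical helper; alpha keeps
only its top bit, per the 0xffff8000 mask):

  static uint16 ARGBToARGB1555Pixel_sketch(uint8 b, uint8 g, uint8 r,
                                           uint8 a) {
    return (uint16)((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
                    ((a >> 7) << 15));
  }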
1003 __declspec(naked) | 1003 __declspec(naked) |
1004 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1004 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
1005 __asm { | 1005 __asm { |
1006 mov eax, [esp + 4] // src_argb | 1006 mov eax, [esp + 4] // src_argb |
1007 mov edx, [esp + 8] // dst_rgb | 1007 mov edx, [esp + 8] // dst_rgb |
1008 mov ecx, [esp + 12] // pix | 1008 mov ecx, [esp + 12] // width |
1009 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 | 1009 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
1010 psllw xmm4, 12 | 1010 psllw xmm4, 12 |
1011 movdqa xmm3, xmm4 // generate mask 0x00f000f0 | 1011 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
1012 psrlw xmm3, 8 | 1012 psrlw xmm3, 8 |
1013 | 1013 |
1014 convertloop: | 1014 convertloop: |
1015 movdqu xmm0, [eax] // fetch 4 pixels of argb | 1015 movdqu xmm0, [eax] // fetch 4 pixels of argb |
1016 movdqa xmm1, xmm0 | 1016 movdqa xmm1, xmm0 |
1017 pand xmm0, xmm3 // low nibble | 1017 pand xmm0, xmm3 // low nibble |
1018 pand xmm1, xmm4 // high nibble | 1018 pand xmm1, xmm4 // high nibble |
1019 psrld xmm0, 4 | 1019 psrld xmm0, 4 |
1020 psrld xmm1, 8 | 1020 psrld xmm1, 8 |
1021 por xmm0, xmm1 | 1021 por xmm0, xmm1 |
1022 packuswb xmm0, xmm0 | 1022 packuswb xmm0, xmm0 |
1023 lea eax, [eax + 16] | 1023 lea eax, [eax + 16] |
1024 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 | 1024 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
1025 lea edx, [edx + 8] | 1025 lea edx, [edx + 8] |
1026 sub ecx, 4 | 1026 sub ecx, 4 |
1027 jg convertloop | 1027 jg convertloop |
1028 ret | 1028 ret |
1029 } | 1029 } |
1030 } | 1030 } |
1031 | 1031 |
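Scalar sketch of the 4444 packing above (hypothetical helper; each channel
keeps only its high nibble):

  static uint16 ARGBToARGB4444Pixel_sketch(uint8 b, uint8 g, uint8 r,
                                           uint8 a) {
    return (uint16)((b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) |
                    ((a >> 4) << 12));
  }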
1032 #ifdef HAS_ARGBTORGB565ROW_AVX2 | 1032 #ifdef HAS_ARGBTORGB565ROW_AVX2 |
1033 __declspec(naked) | 1033 __declspec(naked) |
1034 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1034 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
1035 __asm { | 1035 __asm { |
1036 mov eax, [esp + 4] // src_argb | 1036 mov eax, [esp + 4] // src_argb |
1037 mov edx, [esp + 8] // dst_rgb | 1037 mov edx, [esp + 8] // dst_rgb |
1038 mov ecx, [esp + 12] // pix | 1038 mov ecx, [esp + 12] // width |
1039 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 1039 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
1040 vpsrld ymm3, ymm3, 27 | 1040 vpsrld ymm3, ymm3, 27 |
1041 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 1041 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
1042 vpsrld ymm4, ymm4, 26 | 1042 vpsrld ymm4, ymm4, 26 |
1043 vpslld ymm4, ymm4, 5 | 1043 vpslld ymm4, ymm4, 5 |
1044 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 | 1044 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
1045 | 1045 |
1046 convertloop: | 1046 convertloop: |
1047 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1047 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
1048 vpsrld ymm2, ymm0, 5 // G | 1048 vpsrld ymm2, ymm0, 5 // G |
(...skipping 12 matching lines...)
1061 sub ecx, 8 | 1061 sub ecx, 8 |
1062 jg convertloop | 1062 jg convertloop |
1063 vzeroupper | 1063 vzeroupper |
1064 ret | 1064 ret |
1065 } | 1065 } |
1066 } | 1066 } |
1067 #endif // HAS_ARGBTORGB565ROW_AVX2 | 1067 #endif // HAS_ARGBTORGB565ROW_AVX2 |
1068 | 1068 |
1069 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 1069 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
1070 __declspec(naked) | 1070 __declspec(naked) |
1071 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1071 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
1072 __asm { | 1072 __asm { |
1073 mov eax, [esp + 4] // src_argb | 1073 mov eax, [esp + 4] // src_argb |
1074 mov edx, [esp + 8] // dst_rgb | 1074 mov edx, [esp + 8] // dst_rgb |
1075 mov ecx, [esp + 12] // pix | 1075 mov ecx, [esp + 12] // width |
1076 vpcmpeqb ymm4, ymm4, ymm4 | 1076 vpcmpeqb ymm4, ymm4, ymm4 |
1077 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f | 1077 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
1078 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 | 1078 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
1079 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 | 1079 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
1080 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 | 1080 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
1081 vpslld ymm7, ymm7, 15 | 1081 vpslld ymm7, ymm7, 15 |
1082 | 1082 |
1083 convertloop: | 1083 convertloop: |
1084 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1084 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
1085 vpsrld ymm3, ymm0, 9 // R | 1085 vpsrld ymm3, ymm0, 9 // R |
(...skipping 15 matching lines...)
1101 sub ecx, 8 | 1101 sub ecx, 8 |
1102 jg convertloop | 1102 jg convertloop |
1103 vzeroupper | 1103 vzeroupper |
1104 ret | 1104 ret |
1105 } | 1105 } |
1106 } | 1106 } |
1107 #endif // HAS_ARGBTOARGB1555ROW_AVX2 | 1107 #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
1108 | 1108 |
1109 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 1109 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
1110 __declspec(naked) | 1110 __declspec(naked) |
1111 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1111 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
1112 __asm { | 1112 __asm { |
1113 mov eax, [esp + 4] // src_argb | 1113 mov eax, [esp + 4] // src_argb |
1114 mov edx, [esp + 8] // dst_rgb | 1114 mov edx, [esp + 8] // dst_rgb |
1115 mov ecx, [esp + 12] // pix | 1115 mov ecx, [esp + 12] // width |
1116 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 | 1116 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
1117 vpsllw ymm4, ymm4, 12 | 1117 vpsllw ymm4, ymm4, 12 |
1118 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 | 1118 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
1119 | 1119 |
1120 convertloop: | 1120 convertloop: |
1121 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1121 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
1122 vpand ymm1, ymm0, ymm4 // high nibble | 1122 vpand ymm1, ymm0, ymm4 // high nibble |
1123 vpand ymm0, ymm0, ymm3 // low nibble | 1123 vpand ymm0, ymm0, ymm3 // low nibble |
1124 vpsrld ymm1, ymm1, 8 | 1124 vpsrld ymm1, ymm1, 8 |
1125 vpsrld ymm0, ymm0, 4 | 1125 vpsrld ymm0, ymm0, 4 |
1126 vpor ymm0, ymm0, ymm1 | 1126 vpor ymm0, ymm0, ymm1 |
1127 vpackuswb ymm0, ymm0, ymm0 | 1127 vpackuswb ymm0, ymm0, ymm0 |
1128 vpermq ymm0, ymm0, 0xd8 | 1128 vpermq ymm0, ymm0, 0xd8 |
1129 lea eax, [eax + 32] | 1129 lea eax, [eax + 32] |
1130 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 | 1130 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 |
1131 lea edx, [edx + 16] | 1131 lea edx, [edx + 16] |
1132 sub ecx, 8 | 1132 sub ecx, 8 |
1133 jg convertloop | 1133 jg convertloop |
1134 vzeroupper | 1134 vzeroupper |
1135 ret | 1135 ret |
1136 } | 1136 } |
1137 } | 1137 } |
1138 #endif // HAS_ARGBTOARGB4444ROW_AVX2 | 1138 #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
1139 | 1139 |
1140 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 1140 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
1141 __declspec(naked) | 1141 __declspec(naked) |
1142 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1142 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
1143 __asm { | 1143 __asm { |
1144 mov eax, [esp + 4] /* src_argb */ | 1144 mov eax, [esp + 4] /* src_argb */ |
1145 mov edx, [esp + 8] /* dst_y */ | 1145 mov edx, [esp + 8] /* dst_y */ |
1146 mov ecx, [esp + 12] /* pix */ | 1146 mov ecx, [esp + 12] /* width */ |
1147 movdqa xmm4, xmmword ptr kARGBToY | 1147 movdqa xmm4, xmmword ptr kARGBToY |
1148 movdqa xmm5, xmmword ptr kAddY16 | 1148 movdqa xmm5, xmmword ptr kAddY16 |
1149 | 1149 |
1150 convertloop: | 1150 convertloop: |
1151 movdqu xmm0, [eax] | 1151 movdqu xmm0, [eax] |
1152 movdqu xmm1, [eax + 16] | 1152 movdqu xmm1, [eax + 16] |
1153 movdqu xmm2, [eax + 32] | 1153 movdqu xmm2, [eax + 32] |
1154 movdqu xmm3, [eax + 48] | 1154 movdqu xmm3, [eax + 48] |
1155 pmaddubsw xmm0, xmm4 | 1155 pmaddubsw xmm0, xmm4 |
1156 pmaddubsw xmm1, xmm4 | 1156 pmaddubsw xmm1, xmm4 |
(...skipping 10 matching lines...)
1167 lea edx, [edx + 16] | 1167 lea edx, [edx + 16] |
1168 sub ecx, 16 | 1168 sub ecx, 16 |
1169 jg convertloop | 1169 jg convertloop |
1170 ret | 1170 ret |
1171 } | 1171 } |
1172 } | 1172 } |
1173 | 1173 |
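Per pixel, the pmaddubsw/phaddw/psrlw/paddb pipeline reduces to the form
below (hypothetical helper; assumes kARGBToY and kAddY16, defined in the
elided top of this file, hold libyuv's usual BT.601 set {13, 65, 33} and
the +16 bias):

  static uint8 ARGBPixelToY_sketch(uint8 b, uint8 g, uint8 r) {
    return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
  }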
1174 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 1174 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
1175 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 1175 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
1176 __declspec(naked) | 1176 __declspec(naked) |
1177 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1177 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
1178 __asm { | 1178 __asm { |
1179 mov eax, [esp + 4] /* src_argb */ | 1179 mov eax, [esp + 4] /* src_argb */ |
1180 mov edx, [esp + 8] /* dst_y */ | 1180 mov edx, [esp + 8] /* dst_y */ |
1181 mov ecx, [esp + 12] /* pix */ | 1181 mov ecx, [esp + 12] /* width */ |
1182 movdqa xmm4, xmmword ptr kARGBToYJ | 1182 movdqa xmm4, xmmword ptr kARGBToYJ |
1183 movdqa xmm5, xmmword ptr kAddYJ64 | 1183 movdqa xmm5, xmmword ptr kAddYJ64 |
1184 | 1184 |
1185 convertloop: | 1185 convertloop: |
1186 movdqu xmm0, [eax] | 1186 movdqu xmm0, [eax] |
1187 movdqu xmm1, [eax + 16] | 1187 movdqu xmm1, [eax + 16] |
1188 movdqu xmm2, [eax + 32] | 1188 movdqu xmm2, [eax + 32] |
1189 movdqu xmm3, [eax + 48] | 1189 movdqu xmm3, [eax + 48] |
1190 pmaddubsw xmm0, xmm4 | 1190 pmaddubsw xmm0, xmm4 |
1191 pmaddubsw xmm1, xmm4 | 1191 pmaddubsw xmm1, xmm4 |
(...skipping 16 matching lines...)
1208 } | 1208 } |
1209 | 1209 |
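Per the comment, the J variant changes coefficients, drops the +16 bias,
and rounds via kAddYJ64 before the shift. Sketch (hypothetical helper;
assumes the full-range coefficients {15, 75, 38}):

  static uint8 ARGBPixelToYJ_sketch(uint8 b, uint8 g, uint8 r) {
    return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
  }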
1210 #ifdef HAS_ARGBTOYROW_AVX2 | 1210 #ifdef HAS_ARGBTOYROW_AVX2 |
1211 // vpermd for vphaddw + vpackuswb vpermd. | 1211 // vpermd for vphaddw + vpackuswb vpermd. |
1212 static const lvec32 kPermdARGBToY_AVX = { | 1212 static const lvec32 kPermdARGBToY_AVX = { |
1213 0, 4, 1, 5, 2, 6, 3, 7 | 1213 0, 4, 1, 5, 2, 6, 3, 7 |
1214 }; | 1214 }; |
1215 | 1215 |
1216 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1216 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
1217 __declspec(naked) | 1217 __declspec(naked) |
1218 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1218 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
1219 __asm { | 1219 __asm { |
1220 mov eax, [esp + 4] /* src_argb */ | 1220 mov eax, [esp + 4] /* src_argb */ |
1221 mov edx, [esp + 8] /* dst_y */ | 1221 mov edx, [esp + 8] /* dst_y */ |
1222 mov ecx, [esp + 12] /* pix */ | 1222 mov ecx, [esp + 12] /* width */ |
1223 vbroadcastf128 ymm4, xmmword ptr kARGBToY | 1223 vbroadcastf128 ymm4, xmmword ptr kARGBToY |
1224 vbroadcastf128 ymm5, xmmword ptr kAddY16 | 1224 vbroadcastf128 ymm5, xmmword ptr kAddY16 |
1225 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX | 1225 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
1226 | 1226 |
1227 convertloop: | 1227 convertloop: |
1228 vmovdqu ymm0, [eax] | 1228 vmovdqu ymm0, [eax] |
1229 vmovdqu ymm1, [eax + 32] | 1229 vmovdqu ymm1, [eax + 32] |
1230 vmovdqu ymm2, [eax + 64] | 1230 vmovdqu ymm2, [eax + 64] |
1231 vmovdqu ymm3, [eax + 96] | 1231 vmovdqu ymm3, [eax + 96] |
1232 vpmaddubsw ymm0, ymm0, ymm4 | 1232 vpmaddubsw ymm0, ymm0, ymm4 |
(...skipping 14 matching lines...)
1247 jg convertloop | 1247 jg convertloop |
1248 vzeroupper | 1248 vzeroupper |
1249 ret | 1249 ret |
1250 } | 1250 } |
1251 } | 1251 } |
1252 #endif // HAS_ARGBTOYROW_AVX2 | 1252 #endif // HAS_ARGBTOYROW_AVX2 |
1253 | 1253 |
1254 #ifdef HAS_ARGBTOYJROW_AVX2 | 1254 #ifdef HAS_ARGBTOYJROW_AVX2 |
1255 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1255 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
1256 __declspec(naked) | 1256 __declspec(naked) |
1257 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1257 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
1258 __asm { | 1258 __asm { |
1259 mov eax, [esp + 4] /* src_argb */ | 1259 mov eax, [esp + 4] /* src_argb */ |
1260 mov edx, [esp + 8] /* dst_y */ | 1260 mov edx, [esp + 8] /* dst_y */ |
1261 mov ecx, [esp + 12] /* pix */ | 1261 mov ecx, [esp + 12] /* width */ |
1262 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ | 1262 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ |
1263 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 | 1263 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 |
1264 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX | 1264 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
1265 | 1265 |
1266 convertloop: | 1266 convertloop: |
1267 vmovdqu ymm0, [eax] | 1267 vmovdqu ymm0, [eax] |
1268 vmovdqu ymm1, [eax + 32] | 1268 vmovdqu ymm1, [eax + 32] |
1269 vmovdqu ymm2, [eax + 64] | 1269 vmovdqu ymm2, [eax + 64] |
1270 vmovdqu ymm3, [eax + 96] | 1270 vmovdqu ymm3, [eax + 96] |
1271 vpmaddubsw ymm0, ymm0, ymm4 | 1271 vpmaddubsw ymm0, ymm0, ymm4 |
(...skipping 14 matching lines...)
1286 sub ecx, 32 | 1286 sub ecx, 32 |
1287 jg convertloop | 1287 jg convertloop |
1288 | 1288 |
1289 vzeroupper | 1289 vzeroupper |
1290 ret | 1290 ret |
1291 } | 1291 } |
1292 } | 1292 } |
1293 #endif // HAS_ARGBTOYJROW_AVX2 | 1293 #endif // HAS_ARGBTOYJROW_AVX2 |
1294 | 1294 |
1295 __declspec(naked) | 1295 __declspec(naked) |
1296 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1296 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
1297 __asm { | 1297 __asm { |
1298 mov eax, [esp + 4] /* src_argb */ | 1298 mov eax, [esp + 4] /* src_argb */ |
1299 mov edx, [esp + 8] /* dst_y */ | 1299 mov edx, [esp + 8] /* dst_y */ |
1300 mov ecx, [esp + 12] /* pix */ | 1300 mov ecx, [esp + 12] /* width */ |
1301 movdqa xmm4, xmmword ptr kBGRAToY | 1301 movdqa xmm4, xmmword ptr kBGRAToY |
1302 movdqa xmm5, xmmword ptr kAddY16 | 1302 movdqa xmm5, xmmword ptr kAddY16 |
1303 | 1303 |
1304 convertloop: | 1304 convertloop: |
1305 movdqu xmm0, [eax] | 1305 movdqu xmm0, [eax] |
1306 movdqu xmm1, [eax + 16] | 1306 movdqu xmm1, [eax + 16] |
1307 movdqu xmm2, [eax + 32] | 1307 movdqu xmm2, [eax + 32] |
1308 movdqu xmm3, [eax + 48] | 1308 movdqu xmm3, [eax + 48] |
1309 pmaddubsw xmm0, xmm4 | 1309 pmaddubsw xmm0, xmm4 |
1310 pmaddubsw xmm1, xmm4 | 1310 pmaddubsw xmm1, xmm4 |
1311 pmaddubsw xmm2, xmm4 | 1311 pmaddubsw xmm2, xmm4 |
1312 pmaddubsw xmm3, xmm4 | 1312 pmaddubsw xmm3, xmm4 |
1313 lea eax, [eax + 64] | 1313 lea eax, [eax + 64] |
1314 phaddw xmm0, xmm1 | 1314 phaddw xmm0, xmm1 |
1315 phaddw xmm2, xmm3 | 1315 phaddw xmm2, xmm3 |
1316 psrlw xmm0, 7 | 1316 psrlw xmm0, 7 |
1317 psrlw xmm2, 7 | 1317 psrlw xmm2, 7 |
1318 packuswb xmm0, xmm2 | 1318 packuswb xmm0, xmm2 |
1319 paddb xmm0, xmm5 | 1319 paddb xmm0, xmm5 |
1320 movdqu [edx], xmm0 | 1320 movdqu [edx], xmm0 |
1321 lea edx, [edx + 16] | 1321 lea edx, [edx + 16] |
1322 sub ecx, 16 | 1322 sub ecx, 16 |
1323 jg convertloop | 1323 jg convertloop |
1324 ret | 1324 ret |
1325 } | 1325 } |
1326 } | 1326 } |
1327 | 1327 |
1328 __declspec(naked) | 1328 __declspec(naked) |
1329 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1329 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
1330 __asm { | 1330 __asm { |
1331 mov eax, [esp + 4] /* src_argb */ | 1331 mov eax, [esp + 4] /* src_argb */ |
1332 mov edx, [esp + 8] /* dst_y */ | 1332 mov edx, [esp + 8] /* dst_y */ |
1333 mov ecx, [esp + 12] /* pix */ | 1333 mov ecx, [esp + 12] /* width */ |
1334 movdqa xmm4, xmmword ptr kABGRToY | 1334 movdqa xmm4, xmmword ptr kABGRToY |
1335 movdqa xmm5, xmmword ptr kAddY16 | 1335 movdqa xmm5, xmmword ptr kAddY16 |
1336 | 1336 |
1337 convertloop: | 1337 convertloop: |
1338 movdqu xmm0, [eax] | 1338 movdqu xmm0, [eax] |
1339 movdqu xmm1, [eax + 16] | 1339 movdqu xmm1, [eax + 16] |
1340 movdqu xmm2, [eax + 32] | 1340 movdqu xmm2, [eax + 32] |
1341 movdqu xmm3, [eax + 48] | 1341 movdqu xmm3, [eax + 48] |
1342 pmaddubsw xmm0, xmm4 | 1342 pmaddubsw xmm0, xmm4 |
1343 pmaddubsw xmm1, xmm4 | 1343 pmaddubsw xmm1, xmm4 |
1344 pmaddubsw xmm2, xmm4 | 1344 pmaddubsw xmm2, xmm4 |
1345 pmaddubsw xmm3, xmm4 | 1345 pmaddubsw xmm3, xmm4 |
1346 lea eax, [eax + 64] | 1346 lea eax, [eax + 64] |
1347 phaddw xmm0, xmm1 | 1347 phaddw xmm0, xmm1 |
1348 phaddw xmm2, xmm3 | 1348 phaddw xmm2, xmm3 |
1349 psrlw xmm0, 7 | 1349 psrlw xmm0, 7 |
1350 psrlw xmm2, 7 | 1350 psrlw xmm2, 7 |
1351 packuswb xmm0, xmm2 | 1351 packuswb xmm0, xmm2 |
1352 paddb xmm0, xmm5 | 1352 paddb xmm0, xmm5 |
1353 movdqu [edx], xmm0 | 1353 movdqu [edx], xmm0 |
1354 lea edx, [edx + 16] | 1354 lea edx, [edx + 16] |
1355 sub ecx, 16 | 1355 sub ecx, 16 |
1356 jg convertloop | 1356 jg convertloop |
1357 ret | 1357 ret |
1358 } | 1358 } |
1359 } | 1359 } |
1360 | 1360 |
1361 __declspec(naked) | 1361 __declspec(naked) |
1362 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1362 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
1363 __asm { | 1363 __asm { |
1364 mov eax, [esp + 4] /* src_argb */ | 1364 mov eax, [esp + 4] /* src_argb */ |
1365 mov edx, [esp + 8] /* dst_y */ | 1365 mov edx, [esp + 8] /* dst_y */ |
1366 mov ecx, [esp + 12] /* pix */ | 1366 mov ecx, [esp + 12] /* width */ |
1367 movdqa xmm4, xmmword ptr kRGBAToY | 1367 movdqa xmm4, xmmword ptr kRGBAToY |
1368 movdqa xmm5, xmmword ptr kAddY16 | 1368 movdqa xmm5, xmmword ptr kAddY16 |
1369 | 1369 |
1370 convertloop: | 1370 convertloop: |
1371 movdqu xmm0, [eax] | 1371 movdqu xmm0, [eax] |
1372 movdqu xmm1, [eax + 16] | 1372 movdqu xmm1, [eax + 16] |
1373 movdqu xmm2, [eax + 32] | 1373 movdqu xmm2, [eax + 32] |
1374 movdqu xmm3, [eax + 48] | 1374 movdqu xmm3, [eax + 48] |
1375 pmaddubsw xmm0, xmm4 | 1375 pmaddubsw xmm0, xmm4 |
1376 pmaddubsw xmm1, xmm4 | 1376 pmaddubsw xmm1, xmm4 |
(...skipping 17 matching lines...)
1394 __declspec(naked) | 1394 __declspec(naked) |
1395 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1395 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1396 uint8* dst_u, uint8* dst_v, int width) { | 1396 uint8* dst_u, uint8* dst_v, int width) { |
1397 __asm { | 1397 __asm { |
1398 push esi | 1398 push esi |
1399 push edi | 1399 push edi |
1400 mov eax, [esp + 8 + 4] // src_argb | 1400 mov eax, [esp + 8 + 4] // src_argb |
1401 mov esi, [esp + 8 + 8] // src_stride_argb | 1401 mov esi, [esp + 8 + 8] // src_stride_argb |
1402 mov edx, [esp + 8 + 12] // dst_u | 1402 mov edx, [esp + 8 + 12] // dst_u |
1403 mov edi, [esp + 8 + 16] // dst_v | 1403 mov edi, [esp + 8 + 16] // dst_v |
1404 mov ecx, [esp + 8 + 20] // pix | 1404 mov ecx, [esp + 8 + 20] // width |
1405 movdqa xmm5, xmmword ptr kAddUV128 | 1405 movdqa xmm5, xmmword ptr kAddUV128 |
1406 movdqa xmm6, xmmword ptr kARGBToV | 1406 movdqa xmm6, xmmword ptr kARGBToV |
1407 movdqa xmm7, xmmword ptr kARGBToU | 1407 movdqa xmm7, xmmword ptr kARGBToU |
1408 sub edi, edx // stride from u to v | 1408 sub edi, edx // stride from u to v |
1409 | 1409 |
1410 convertloop: | 1410 convertloop: |
1411 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1411 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1412 movdqu xmm0, [eax] | 1412 movdqu xmm0, [eax] |
1413 movdqu xmm4, [eax + esi] | 1413 movdqu xmm4, [eax + esi] |
1414 pavgb xmm0, xmm4 | 1414 pavgb xmm0, xmm4 |
(...skipping 49 matching lines...)
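After the 2x2 pavgb subsampling above, each averaged pixel is reduced with
pmaddubsw/psraw/paddb; a scalar sketch of that step (hypothetical helper;
assumes libyuv's usual kARGBToU/kARGBToV values and an arithmetic right
shift, as psraw performs):

  static void ARGBAvgToUV_sketch(uint8 b, uint8 g, uint8 r,
                                 uint8* u, uint8* v) {
    *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
  }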
1464 __declspec(naked) | 1464 __declspec(naked) |
1465 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1465 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1466 uint8* dst_u, uint8* dst_v, int width) { | 1466 uint8* dst_u, uint8* dst_v, int width) { |
1467 __asm { | 1467 __asm { |
1468 push esi | 1468 push esi |
1469 push edi | 1469 push edi |
1470 mov eax, [esp + 8 + 4] // src_argb | 1470 mov eax, [esp + 8 + 4] // src_argb |
1471 mov esi, [esp + 8 + 8] // src_stride_argb | 1471 mov esi, [esp + 8 + 8] // src_stride_argb |
1472 mov edx, [esp + 8 + 12] // dst_u | 1472 mov edx, [esp + 8 + 12] // dst_u |
1473 mov edi, [esp + 8 + 16] // dst_v | 1473 mov edi, [esp + 8 + 16] // dst_v |
1474 mov ecx, [esp + 8 + 20] // pix | 1474 mov ecx, [esp + 8 + 20] // width |
1475 movdqa xmm5, xmmword ptr kAddUVJ128 | 1475 movdqa xmm5, xmmword ptr kAddUVJ128 |
1476 movdqa xmm6, xmmword ptr kARGBToVJ | 1476 movdqa xmm6, xmmword ptr kARGBToVJ |
1477 movdqa xmm7, xmmword ptr kARGBToUJ | 1477 movdqa xmm7, xmmword ptr kARGBToUJ |
1478 sub edi, edx // stride from u to v | 1478 sub edi, edx // stride from u to v |
1479 | 1479 |
1480 convertloop: | 1480 convertloop: |
1481 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1481 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1482 movdqu xmm0, [eax] | 1482 movdqu xmm0, [eax] |
1483 movdqu xmm4, [eax + esi] | 1483 movdqu xmm4, [eax + esi] |
1484 pavgb xmm0, xmm4 | 1484 pavgb xmm0, xmm4 |
(...skipping 51 matching lines...)
1536 __declspec(naked) | 1536 __declspec(naked) |
1537 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 1537 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
1538 uint8* dst_u, uint8* dst_v, int width) { | 1538 uint8* dst_u, uint8* dst_v, int width) { |
1539 __asm { | 1539 __asm { |
1540 push esi | 1540 push esi |
1541 push edi | 1541 push edi |
1542 mov eax, [esp + 8 + 4] // src_argb | 1542 mov eax, [esp + 8 + 4] // src_argb |
1543 mov esi, [esp + 8 + 8] // src_stride_argb | 1543 mov esi, [esp + 8 + 8] // src_stride_argb |
1544 mov edx, [esp + 8 + 12] // dst_u | 1544 mov edx, [esp + 8 + 12] // dst_u |
1545 mov edi, [esp + 8 + 16] // dst_v | 1545 mov edi, [esp + 8 + 16] // dst_v |
1546 mov ecx, [esp + 8 + 20] // pix | 1546 mov ecx, [esp + 8 + 20] // width |
1547 vbroadcastf128 ymm5, xmmword ptr kAddUV128 | 1547 vbroadcastf128 ymm5, xmmword ptr kAddUV128 |
1548 vbroadcastf128 ymm6, xmmword ptr kARGBToV | 1548 vbroadcastf128 ymm6, xmmword ptr kARGBToV |
1549 vbroadcastf128 ymm7, xmmword ptr kARGBToU | 1549 vbroadcastf128 ymm7, xmmword ptr kARGBToU |
1550 sub edi, edx // stride from u to v | 1550 sub edi, edx // stride from u to v |
1551 | 1551 |
1552 convertloop: | 1552 convertloop: |
1553 /* step 1 - subsample 32x2 argb pixels to 16x1 */ | 1553 /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
1554 vmovdqu ymm0, [eax] | 1554 vmovdqu ymm0, [eax] |
1555 vmovdqu ymm1, [eax + 32] | 1555 vmovdqu ymm1, [eax + 32] |
1556 vmovdqu ymm2, [eax + 64] | 1556 vmovdqu ymm2, [eax + 64] |
(...skipping 42 matching lines...)
1599 #endif // HAS_ARGBTOUVROW_AVX2 | 1599 #endif // HAS_ARGBTOUVROW_AVX2 |
1600 | 1600 |
1601 __declspec(naked) | 1601 __declspec(naked) |
1602 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1602 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
1603 uint8* dst_u, uint8* dst_v, int width) { | 1603 uint8* dst_u, uint8* dst_v, int width) { |
1604 __asm { | 1604 __asm { |
1605 push edi | 1605 push edi |
1606 mov eax, [esp + 4 + 4] // src_argb | 1606 mov eax, [esp + 4 + 4] // src_argb |
1607 mov edx, [esp + 4 + 8] // dst_u | 1607 mov edx, [esp + 4 + 8] // dst_u |
1608 mov edi, [esp + 4 + 12] // dst_v | 1608 mov edi, [esp + 4 + 12] // dst_v |
1609 mov ecx, [esp + 4 + 16] // pix | 1609 mov ecx, [esp + 4 + 16] // width |
1610 movdqa xmm5, xmmword ptr kAddUV128 | 1610 movdqa xmm5, xmmword ptr kAddUV128 |
1611 movdqa xmm6, xmmword ptr kARGBToV | 1611 movdqa xmm6, xmmword ptr kARGBToV |
1612 movdqa xmm7, xmmword ptr kARGBToU | 1612 movdqa xmm7, xmmword ptr kARGBToU |
1613 sub edi, edx // stride from u to v | 1613 sub edi, edx // stride from u to v |
1614 | 1614 |
1615 convertloop: | 1615 convertloop: |
1616 /* convert to U and V */ | 1616 /* convert to U and V */ |
1617 movdqu xmm0, [eax] // U | 1617 movdqu xmm0, [eax] // U |
1618 movdqu xmm1, [eax + 16] | 1618 movdqu xmm1, [eax + 16] |
1619 movdqu xmm2, [eax + 32] | 1619 movdqu xmm2, [eax + 32] |
(...skipping 36 matching lines...)
1656 } | 1656 } |
1657 | 1657 |
1658 __declspec(naked) | 1658 __declspec(naked) |
1659 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 1659 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
1660 uint8* dst_u, uint8* dst_v, int width) { | 1660 uint8* dst_u, uint8* dst_v, int width) { |
1661 __asm { | 1661 __asm { |
1662 push edi | 1662 push edi |
1663 mov eax, [esp + 4 + 4] // src_argb | 1663 mov eax, [esp + 4 + 4] // src_argb |
1664 mov edx, [esp + 4 + 8] // dst_u | 1664 mov edx, [esp + 4 + 8] // dst_u |
1665 mov edi, [esp + 4 + 12] // dst_v | 1665 mov edi, [esp + 4 + 12] // dst_v |
1666 mov ecx, [esp + 4 + 16] // pix | 1666 mov ecx, [esp + 4 + 16] // width |
1667 movdqa xmm5, xmmword ptr kAddUV128 | 1667 movdqa xmm5, xmmword ptr kAddUV128 |
1668 movdqa xmm6, xmmword ptr kARGBToV | 1668 movdqa xmm6, xmmword ptr kARGBToV |
1669 movdqa xmm7, xmmword ptr kARGBToU | 1669 movdqa xmm7, xmmword ptr kARGBToU |
1670 sub edi, edx // stride from u to v | 1670 sub edi, edx // stride from u to v |
1671 | 1671 |
1672 convertloop: | 1672 convertloop: |
1673 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1673 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1674 movdqu xmm0, [eax] | 1674 movdqu xmm0, [eax] |
1675 movdqu xmm1, [eax + 16] | 1675 movdqu xmm1, [eax + 16] |
1676 movdqu xmm2, [eax + 32] | 1676 movdqu xmm2, [eax + 32] |
(...skipping 39 matching lines...)
1716 __declspec(naked) | 1716 __declspec(naked) |
1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1718 uint8* dst_u, uint8* dst_v, int width) { | 1718 uint8* dst_u, uint8* dst_v, int width) { |
1719 __asm { | 1719 __asm { |
1720 push esi | 1720 push esi |
1721 push edi | 1721 push edi |
1722 mov eax, [esp + 8 + 4] // src_argb | 1722 mov eax, [esp + 8 + 4] // src_argb |
1723 mov esi, [esp + 8 + 8] // src_stride_argb | 1723 mov esi, [esp + 8 + 8] // src_stride_argb |
1724 mov edx, [esp + 8 + 12] // dst_u | 1724 mov edx, [esp + 8 + 12] // dst_u |
1725 mov edi, [esp + 8 + 16] // dst_v | 1725 mov edi, [esp + 8 + 16] // dst_v |
1726 mov ecx, [esp + 8 + 20] // pix | 1726 mov ecx, [esp + 8 + 20] // width |
1727 movdqa xmm5, xmmword ptr kAddUV128 | 1727 movdqa xmm5, xmmword ptr kAddUV128 |
1728 movdqa xmm6, xmmword ptr kBGRAToV | 1728 movdqa xmm6, xmmword ptr kBGRAToV |
1729 movdqa xmm7, xmmword ptr kBGRAToU | 1729 movdqa xmm7, xmmword ptr kBGRAToU |
1730 sub edi, edx // stride from u to v | 1730 sub edi, edx // stride from u to v |
1731 | 1731 |
1732 convertloop: | 1732 convertloop: |
1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1734 movdqu xmm0, [eax] | 1734 movdqu xmm0, [eax] |
1735 movdqu xmm4, [eax + esi] | 1735 movdqu xmm4, [eax + esi] |
1736 pavgb xmm0, xmm4 | 1736 pavgb xmm0, xmm4 |
(...skipping 49 matching lines...)
1786 __declspec(naked) | 1786 __declspec(naked) |
1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1788 uint8* dst_u, uint8* dst_v, int width) { | 1788 uint8* dst_u, uint8* dst_v, int width) { |
1789 __asm { | 1789 __asm { |
1790 push esi | 1790 push esi |
1791 push edi | 1791 push edi |
1792 mov eax, [esp + 8 + 4] // src_argb | 1792 mov eax, [esp + 8 + 4] // src_argb |
1793 mov esi, [esp + 8 + 8] // src_stride_argb | 1793 mov esi, [esp + 8 + 8] // src_stride_argb |
1794 mov edx, [esp + 8 + 12] // dst_u | 1794 mov edx, [esp + 8 + 12] // dst_u |
1795 mov edi, [esp + 8 + 16] // dst_v | 1795 mov edi, [esp + 8 + 16] // dst_v |
1796 mov ecx, [esp + 8 + 20] // pix | 1796 mov ecx, [esp + 8 + 20] // width |
1797 movdqa xmm5, xmmword ptr kAddUV128 | 1797 movdqa xmm5, xmmword ptr kAddUV128 |
1798 movdqa xmm6, xmmword ptr kABGRToV | 1798 movdqa xmm6, xmmword ptr kABGRToV |
1799 movdqa xmm7, xmmword ptr kABGRToU | 1799 movdqa xmm7, xmmword ptr kABGRToU |
1800 sub edi, edx // stride from u to v | 1800 sub edi, edx // stride from u to v |
1801 | 1801 |
1802 convertloop: | 1802 convertloop: |
1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1804 movdqu xmm0, [eax] | 1804 movdqu xmm0, [eax] |
1805 movdqu xmm4, [eax + esi] | 1805 movdqu xmm4, [eax + esi] |
1806 pavgb xmm0, xmm4 | 1806 pavgb xmm0, xmm4 |
(...skipping 49 matching lines...)
1856 __declspec(naked) | 1856 __declspec(naked) |
1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1858 uint8* dst_u, uint8* dst_v, int width) { | 1858 uint8* dst_u, uint8* dst_v, int width) { |
1859 __asm { | 1859 __asm { |
1860 push esi | 1860 push esi |
1861 push edi | 1861 push edi |
1862 mov eax, [esp + 8 + 4] // src_argb | 1862 mov eax, [esp + 8 + 4] // src_argb |
1863 mov esi, [esp + 8 + 8] // src_stride_argb | 1863 mov esi, [esp + 8 + 8] // src_stride_argb |
1864 mov edx, [esp + 8 + 12] // dst_u | 1864 mov edx, [esp + 8 + 12] // dst_u |
1865 mov edi, [esp + 8 + 16] // dst_v | 1865 mov edi, [esp + 8 + 16] // dst_v |
1866 mov ecx, [esp + 8 + 20] // pix | 1866 mov ecx, [esp + 8 + 20] // width |
1867 movdqa xmm5, xmmword ptr kAddUV128 | 1867 movdqa xmm5, xmmword ptr kAddUV128 |
1868 movdqa xmm6, xmmword ptr kRGBAToV | 1868 movdqa xmm6, xmmword ptr kRGBAToV |
1869 movdqa xmm7, xmmword ptr kRGBAToU | 1869 movdqa xmm7, xmmword ptr kRGBAToU |
1870 sub edi, edx // stride from u to v | 1870 sub edi, edx // stride from u to v |
1871 | 1871 |
1872 convertloop: | 1872 convertloop: |
1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
1874 movdqu xmm0, [eax] | 1874 movdqu xmm0, [eax] |
1875 movdqu xmm4, [eax + esi] | 1875 movdqu xmm4, [eax + esi] |
1876 pavgb xmm0, xmm4 | 1876 pavgb xmm0, xmm4 |
(...skipping 1754 matching lines...)
3631 sub ecx, 8 | 3631 sub ecx, 8 |
3632 jg convertloop | 3632 jg convertloop |
3633 vzeroupper | 3633 vzeroupper |
3634 ret | 3634 ret |
3635 } | 3635 } |
3636 } | 3636 } |
3637 #endif // HAS_ARGBMIRRORROW_AVX2 | 3637 #endif // HAS_ARGBMIRRORROW_AVX2 |
3638 | 3638 |
3639 #ifdef HAS_SPLITUVROW_SSE2 | 3639 #ifdef HAS_SPLITUVROW_SSE2 |
3640 __declspec(naked) | 3640 __declspec(naked) |
3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
3642 __asm { | 3642 __asm { |
3643 push edi | 3643 push edi |
3644 mov eax, [esp + 4 + 4] // src_uv | 3644 mov eax, [esp + 4 + 4] // src_uv |
3645 mov edx, [esp + 4 + 8] // dst_u | 3645 mov edx, [esp + 4 + 8] // dst_u |
3646 mov edi, [esp + 4 + 12] // dst_v | 3646 mov edi, [esp + 4 + 12] // dst_v |
3647 mov ecx, [esp + 4 + 16] // pix | 3647 mov ecx, [esp + 4 + 16] // width |
3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
3649 psrlw xmm5, 8 | 3649 psrlw xmm5, 8 |
3650 sub edi, edx | 3650 sub edi, edx |
3651 | 3651 |
3652 convertloop: | 3652 convertloop: |
3653 movdqu xmm0, [eax] | 3653 movdqu xmm0, [eax] |
3654 movdqu xmm1, [eax + 16] | 3654 movdqu xmm1, [eax + 16] |
3655 lea eax, [eax + 32] | 3655 lea eax, [eax + 32] |
3656 movdqa xmm2, xmm0 | 3656 movdqa xmm2, xmm0 |
3657 movdqa xmm3, xmm1 | 3657 movdqa xmm3, xmm1 |
(...skipping 11 matching lines...)
3669 | 3669 |
3670 pop edi | 3670 pop edi |
3671 ret | 3671 ret |
3672 } | 3672 } |
3673 } | 3673 } |
3674 | 3674 |
3675 #endif // HAS_SPLITUVROW_SSE2 | 3675 #endif // HAS_SPLITUVROW_SSE2 |
3676 | 3676 |
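The mask-and-shift trick above (0x00ff00ff keeps the even bytes for U; psrlw by 8 exposes the odd bytes for V) is a vectorized byte deinterleave. A minimal scalar sketch, hypothetical name, types from <stdint.h> as in the earlier sketches:

    /* Split interleaved UV pairs into planar U and V. */
    static void SplitUVRow_Sketch(const uint8_t* src_uv,
                                  uint8_t* dst_u, uint8_t* dst_v, int width) {
      for (int x = 0; x < width; ++x) {
        dst_u[x] = src_uv[0];  /* even bytes are U */
        dst_v[x] = src_uv[1];  /* odd bytes are V */
        src_uv += 2;
      }
    }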
3677 #ifdef HAS_SPLITUVROW_AVX2 | 3677 #ifdef HAS_SPLITUVROW_AVX2 |
3678 __declspec(naked) | 3678 __declspec(naked) |
3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
3680 __asm { | 3680 __asm { |
3681 push edi | 3681 push edi |
3682 mov eax, [esp + 4 + 4] // src_uv | 3682 mov eax, [esp + 4 + 4] // src_uv |
3683 mov edx, [esp + 4 + 8] // dst_u | 3683 mov edx, [esp + 4 + 8] // dst_u |
3684 mov edi, [esp + 4 + 12] // dst_v | 3684 mov edi, [esp + 4 + 12] // dst_v |
3685 mov ecx, [esp + 4 + 16] // pix | 3685 mov ecx, [esp + 4 + 16] // width |
3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
3687 vpsrlw ymm5, ymm5, 8 | 3687 vpsrlw ymm5, ymm5, 8 |
3688 sub edi, edx | 3688 sub edi, edx |
3689 | 3689 |
3690 convertloop: | 3690 convertloop: |
3691 vmovdqu ymm0, [eax] | 3691 vmovdqu ymm0, [eax] |
3692 vmovdqu ymm1, [eax + 32] | 3692 vmovdqu ymm1, [eax + 32] |
3693 lea eax, [eax + 64] | 3693 lea eax, [eax + 64] |
3694 vpsrlw ymm2, ymm0, 8 // odd bytes | 3694 vpsrlw ymm2, ymm0, 8 // odd bytes |
3695 vpsrlw ymm3, ymm1, 8 | 3695 vpsrlw ymm3, ymm1, 8 |
(...skipping 321 matching lines...)
4017 mov ecx, [esp + 12] // count | 4017 mov ecx, [esp + 12] // count |
4018 rep stosd | 4018 rep stosd |
4019 mov edi, edx | 4019 mov edi, edx |
4020 ret | 4020 ret |
4021 } | 4021 } |
4022 } | 4022 } |
4023 #endif // HAS_SETROW_X86 | 4023 #endif // HAS_SETROW_X86 |
4024 | 4024 |
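The tail above finishes with rep stosd, the x86 string instruction that stores eax to [edi] ecx times; in C that is simply a 32-bit fill (hypothetical name, <stdint.h> types as before):

    /* What 'rep stosd' performs in a single instruction. */
    static void SetRow32_Sketch(uint32_t* dst, uint32_t v32, int count) {
      for (int i = 0; i < count; ++i) {
        dst[i] = v32;
      }
    }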
4025 #ifdef HAS_YUY2TOYROW_AVX2 | 4025 #ifdef HAS_YUY2TOYROW_AVX2 |
4026 __declspec(naked) | 4026 __declspec(naked) |
4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { | 4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { |
4028 __asm { | 4028 __asm { |
4029 mov eax, [esp + 4] // src_yuy2 | 4029 mov eax, [esp + 4] // src_yuy2 |
4030 mov edx, [esp + 8] // dst_y | 4030 mov edx, [esp + 8] // dst_y |
4031 mov ecx, [esp + 12] // pix | 4031 mov ecx, [esp + 12] // width |
4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
4033 vpsrlw ymm5, ymm5, 8 | 4033 vpsrlw ymm5, ymm5, 8 |
4034 | 4034 |
4035 convertloop: | 4035 convertloop: |
4036 vmovdqu ymm0, [eax] | 4036 vmovdqu ymm0, [eax] |
4037 vmovdqu ymm1, [eax + 32] | 4037 vmovdqu ymm1, [eax + 32] |
4038 lea eax, [eax + 64] | 4038 lea eax, [eax + 64] |
4039 vpand ymm0, ymm0, ymm5 // even bytes are Y | 4039 vpand ymm0, ymm0, ymm5 // even bytes are Y |
4040 vpand ymm1, ymm1, ymm5 | 4040 vpand ymm1, ymm1, ymm5 |
4041 vpackuswb ymm0, ymm0, ymm1 // mutates. | 4041 vpackuswb ymm0, ymm0, ymm1 // mutates. |
4042 vpermq ymm0, ymm0, 0xd8 | 4042 vpermq ymm0, ymm0, 0xd8 |
4043 vmovdqu [edx], ymm0 | 4043 vmovdqu [edx], ymm0 |
4044 lea edx, [edx + 32] | 4044 lea edx, [edx + 32] |
4045 sub ecx, 32 | 4045 sub ecx, 32 |
4046 jg convertloop | 4046 jg convertloop |
4047 vzeroupper | 4047 vzeroupper |
4048 ret | 4048 ret |
4049 } | 4049 } |
4050 } | 4050 } |
4051 | 4051 |
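YUY2 packs two pixels in four bytes as Y0 U Y1 V, so luma extraction is exactly the even-byte mask (0x00ff00ff) and pack seen above. A scalar sketch, hypothetical name:

    static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2,
                                  uint8_t* dst_y, int width) {
      for (int x = 0; x < width; ++x) {
        dst_y[x] = src_yuy2[x * 2];  /* even bytes are Y */
      }
    }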
4052 __declspec(naked) | 4052 __declspec(naked) |
4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
4054 uint8* dst_u, uint8* dst_v, int pix) { | 4054 uint8* dst_u, uint8* dst_v, int width) { |
4055 __asm { | 4055 __asm { |
4056 push esi | 4056 push esi |
4057 push edi | 4057 push edi |
4058 mov eax, [esp + 8 + 4] // src_yuy2 | 4058 mov eax, [esp + 8 + 4] // src_yuy2 |
4059 mov esi, [esp + 8 + 8] // stride_yuy2 | 4059 mov esi, [esp + 8 + 8] // stride_yuy2 |
4060 mov edx, [esp + 8 + 12] // dst_u | 4060 mov edx, [esp + 8 + 12] // dst_u |
4061 mov edi, [esp + 8 + 16] // dst_v | 4061 mov edi, [esp + 8 + 16] // dst_v |
4062 mov ecx, [esp + 8 + 20] // pix | 4062 mov ecx, [esp + 8 + 20] // width |
4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
4064 vpsrlw ymm5, ymm5, 8 | 4064 vpsrlw ymm5, ymm5, 8 |
4065 sub edi, edx | 4065 sub edi, edx |
4066 | 4066 |
4067 convertloop: | 4067 convertloop: |
4068 vmovdqu ymm0, [eax] | 4068 vmovdqu ymm0, [eax] |
4069 vmovdqu ymm1, [eax + 32] | 4069 vmovdqu ymm1, [eax + 32] |
4070 vpavgb ymm0, ymm0, [eax + esi] | 4070 vpavgb ymm0, ymm0, [eax + esi] |
4071 vpavgb ymm1, ymm1, [eax + esi + 32] | 4071 vpavgb ymm1, ymm1, [eax + esi + 32] |
4072 lea eax, [eax + 64] | 4072 lea eax, [eax + 64] |
(...skipping 15 matching lines...)
4088 | 4088 |
4089 pop edi | 4089 pop edi |
4090 pop esi | 4090 pop esi |
4091 vzeroupper | 4091 vzeroupper |
4092 ret | 4092 ret |
4093 } | 4093 } |
4094 } | 4094 } |
4095 | 4095 |
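The UV row first averages chroma vertically across two source rows (vpavgb against [eax + esi]) and then splits U from V; the UV422 variant below performs the same split from a single row. A sketch, assuming width counts luma pixels:

    static void YUY2ToUVRow_Sketch(const uint8_t* src_yuy2, int stride_yuy2,
                                   uint8_t* dst_u, uint8_t* dst_v, int width) {
      for (int x = 0; x < width; x += 2) {
        /* Bytes are Y0 U Y1 V; average with rounding, like pavgb. */
        *dst_u++ = (uint8_t)((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
        *dst_v++ = (uint8_t)((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
        src_yuy2 += 4;
      }
    }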
4096 __declspec(naked) | 4096 __declspec(naked) |
4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
4098 uint8* dst_u, uint8* dst_v, int pix) { | 4098 uint8* dst_u, uint8* dst_v, int width) { |
4099 __asm { | 4099 __asm { |
4100 push edi | 4100 push edi |
4101 mov eax, [esp + 4 + 4] // src_yuy2 | 4101 mov eax, [esp + 4 + 4] // src_yuy2 |
4102 mov edx, [esp + 4 + 8] // dst_u | 4102 mov edx, [esp + 4 + 8] // dst_u |
4103 mov edi, [esp + 4 + 12] // dst_v | 4103 mov edi, [esp + 4 + 12] // dst_v |
4104 mov ecx, [esp + 4 + 16] // pix | 4104 mov ecx, [esp + 4 + 16] // width |
4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
4106 vpsrlw ymm5, ymm5, 8 | 4106 vpsrlw ymm5, ymm5, 8 |
4107 sub edi, edx | 4107 sub edi, edx |
4108 | 4108 |
4109 convertloop: | 4109 convertloop: |
4110 vmovdqu ymm0, [eax] | 4110 vmovdqu ymm0, [eax] |
4111 vmovdqu ymm1, [eax + 32] | 4111 vmovdqu ymm1, [eax + 32] |
4112 lea eax, [eax + 64] | 4112 lea eax, [eax + 64] |
4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV | 4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
4114 vpsrlw ymm1, ymm1, 8 | 4114 vpsrlw ymm1, ymm1, 8 |
(...skipping 12 matching lines...)
4127 jg convertloop | 4127 jg convertloop |
4128 | 4128 |
4129 pop edi | 4129 pop edi |
4130 vzeroupper | 4130 vzeroupper |
4131 ret | 4131 ret |
4132 } | 4132 } |
4133 } | 4133 } |
4134 | 4134 |
4135 __declspec(naked) | 4135 __declspec(naked) |
4136 void UYVYToYRow_AVX2(const uint8* src_uyvy, | 4136 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
4137 uint8* dst_y, int pix) { | 4137 uint8* dst_y, int width) { |
4138 __asm { | 4138 __asm { |
4139 mov eax, [esp + 4] // src_uyvy | 4139 mov eax, [esp + 4] // src_uyvy |
4140 mov edx, [esp + 8] // dst_y | 4140 mov edx, [esp + 8] // dst_y |
4141 mov ecx, [esp + 12] // pix | 4141 mov ecx, [esp + 12] // width |
4142 | 4142 |
4143 convertloop: | 4143 convertloop: |
4144 vmovdqu ymm0, [eax] | 4144 vmovdqu ymm0, [eax] |
4145 vmovdqu ymm1, [eax + 32] | 4145 vmovdqu ymm1, [eax + 32] |
4146 lea eax, [eax + 64] | 4146 lea eax, [eax + 64] |
4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y | 4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
4148 vpsrlw ymm1, ymm1, 8 | 4148 vpsrlw ymm1, ymm1, 8 |
4149 vpackuswb ymm0, ymm0, ymm1 // mutates. | 4149 vpackuswb ymm0, ymm0, ymm1 // mutates. |
4150 vpermq ymm0, ymm0, 0xd8 | 4150 vpermq ymm0, ymm0, 0xd8 |
4151 vmovdqu [edx], ymm0 | 4151 vmovdqu [edx], ymm0 |
4152 lea edx, [edx + 32] | 4152 lea edx, [edx + 32] |
4153 sub ecx, 32 | 4153 sub ecx, 32 |
4154 jg convertloop | 4154 jg convertloop |
4155 vzeroupper | 4155 vzeroupper |
4156 ret | 4156 ret |
4157 } | 4157 } |
4158 } | 4158 } |
4159 | 4159 |
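UYVY is the same packed 4:2:2 layout with the byte roles swapped (U Y0 V Y1), so luma lives in the odd bytes; that is why this routine shifts (vpsrlw) where the YUY2 version masks, and the UYVY chroma rows below mask where the YUY2 ones shift. Scalar sketch:

    static void UYVYToYRow_Sketch(const uint8_t* src_uyvy,
                                  uint8_t* dst_y, int width) {
      for (int x = 0; x < width; ++x) {
        dst_y[x] = src_uyvy[x * 2 + 1];  /* odd bytes are Y in UYVY */
      }
    }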
4160 __declspec(naked) | 4160 __declspec(naked) |
4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
4162 uint8* dst_u, uint8* dst_v, int pix) { | 4162 uint8* dst_u, uint8* dst_v, int width) { |
4163 __asm { | 4163 __asm { |
4164 push esi | 4164 push esi |
4165 push edi | 4165 push edi |
4166 mov eax, [esp + 8 + 4] // src_yuy2 | 4166 mov eax, [esp + 8 + 4] // src_uyvy |
4167 mov esi, [esp + 8 + 8] // stride_yuy2 | 4167 mov esi, [esp + 8 + 8] // stride_uyvy |
4168 mov edx, [esp + 8 + 12] // dst_u | 4168 mov edx, [esp + 8 + 12] // dst_u |
4169 mov edi, [esp + 8 + 16] // dst_v | 4169 mov edi, [esp + 8 + 16] // dst_v |
4170 mov ecx, [esp + 8 + 20] // pix | 4170 mov ecx, [esp + 8 + 20] // width |
4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
4172 vpsrlw ymm5, ymm5, 8 | 4172 vpsrlw ymm5, ymm5, 8 |
4173 sub edi, edx | 4173 sub edi, edx |
4174 | 4174 |
4175 convertloop: | 4175 convertloop: |
4176 vmovdqu ymm0, [eax] | 4176 vmovdqu ymm0, [eax] |
4177 vmovdqu ymm1, [eax + 32] | 4177 vmovdqu ymm1, [eax + 32] |
4178 vpavgb ymm0, ymm0, [eax + esi] | 4178 vpavgb ymm0, ymm0, [eax + esi] |
4179 vpavgb ymm1, ymm1, [eax + esi + 32] | 4179 vpavgb ymm1, ymm1, [eax + esi + 32] |
4180 lea eax, [eax + 64] | 4180 lea eax, [eax + 64] |
(...skipping 15 matching lines...)
4196 | 4196 |
4197 pop edi | 4197 pop edi |
4198 pop esi | 4198 pop esi |
4199 vzeroupper | 4199 vzeroupper |
4200 ret | 4200 ret |
4201 } | 4201 } |
4202 } | 4202 } |
4203 | 4203 |
4204 __declspec(naked) | 4204 __declspec(naked) |
4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
4206 uint8* dst_u, uint8* dst_v, int pix) { | 4206 uint8* dst_u, uint8* dst_v, int width) { |
4207 __asm { | 4207 __asm { |
4208 push edi | 4208 push edi |
4209 mov eax, [esp + 4 + 4] // src_yuy2 | 4209 mov eax, [esp + 4 + 4] // src_uyvy |
4210 mov edx, [esp + 4 + 8] // dst_u | 4210 mov edx, [esp + 4 + 8] // dst_u |
4211 mov edi, [esp + 4 + 12] // dst_v | 4211 mov edi, [esp + 4 + 12] // dst_v |
4212 mov ecx, [esp + 4 + 16] // pix | 4212 mov ecx, [esp + 4 + 16] // width |
4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
4214 vpsrlw ymm5, ymm5, 8 | 4214 vpsrlw ymm5, ymm5, 8 |
4215 sub edi, edx | 4215 sub edi, edx |
4216 | 4216 |
4217 convertloop: | 4217 convertloop: |
4218 vmovdqu ymm0, [eax] | 4218 vmovdqu ymm0, [eax] |
4219 vmovdqu ymm1, [eax + 32] | 4219 vmovdqu ymm1, [eax + 32] |
4220 lea eax, [eax + 64] | 4220 lea eax, [eax + 64] |
4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV | 4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
4222 vpand ymm1, ymm1, ymm5 | 4222 vpand ymm1, ymm1, ymm5 |
(...skipping 14 matching lines...)
4237 pop edi | 4237 pop edi |
4238 vzeroupper | 4238 vzeroupper |
4239 ret | 4239 ret |
4240 } | 4240 } |
4241 } | 4241 } |
4242 #endif // HAS_YUY2TOYROW_AVX2 | 4242 #endif // HAS_YUY2TOYROW_AVX2 |
4243 | 4243 |
4244 #ifdef HAS_YUY2TOYROW_SSE2 | 4244 #ifdef HAS_YUY2TOYROW_SSE2 |
4245 __declspec(naked) | 4245 __declspec(naked) |
4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
4247 uint8* dst_y, int pix) { | 4247 uint8* dst_y, int width) { |
4248 __asm { | 4248 __asm { |
4249 mov eax, [esp + 4] // src_yuy2 | 4249 mov eax, [esp + 4] // src_yuy2 |
4250 mov edx, [esp + 8] // dst_y | 4250 mov edx, [esp + 8] // dst_y |
4251 mov ecx, [esp + 12] // pix | 4251 mov ecx, [esp + 12] // width |
4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
4253 psrlw xmm5, 8 | 4253 psrlw xmm5, 8 |
4254 | 4254 |
4255 convertloop: | 4255 convertloop: |
4256 movdqu xmm0, [eax] | 4256 movdqu xmm0, [eax] |
4257 movdqu xmm1, [eax + 16] | 4257 movdqu xmm1, [eax + 16] |
4258 lea eax, [eax + 32] | 4258 lea eax, [eax + 32] |
4259 pand xmm0, xmm5 // even bytes are Y | 4259 pand xmm0, xmm5 // even bytes are Y |
4260 pand xmm1, xmm5 | 4260 pand xmm1, xmm5 |
4261 packuswb xmm0, xmm1 | 4261 packuswb xmm0, xmm1 |
4262 movdqu [edx], xmm0 | 4262 movdqu [edx], xmm0 |
4263 lea edx, [edx + 16] | 4263 lea edx, [edx + 16] |
4264 sub ecx, 16 | 4264 sub ecx, 16 |
4265 jg convertloop | 4265 jg convertloop |
4266 ret | 4266 ret |
4267 } | 4267 } |
4268 } | 4268 } |
4269 | 4269 |
4270 __declspec(naked) | 4270 __declspec(naked) |
4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
4272 uint8* dst_u, uint8* dst_v, int pix) { | 4272 uint8* dst_u, uint8* dst_v, int width) { |
4273 __asm { | 4273 __asm { |
4274 push esi | 4274 push esi |
4275 push edi | 4275 push edi |
4276 mov eax, [esp + 8 + 4] // src_yuy2 | 4276 mov eax, [esp + 8 + 4] // src_yuy2 |
4277 mov esi, [esp + 8 + 8] // stride_yuy2 | 4277 mov esi, [esp + 8 + 8] // stride_yuy2 |
4278 mov edx, [esp + 8 + 12] // dst_u | 4278 mov edx, [esp + 8 + 12] // dst_u |
4279 mov edi, [esp + 8 + 16] // dst_v | 4279 mov edi, [esp + 8 + 16] // dst_v |
4280 mov ecx, [esp + 8 + 20] // pix | 4280 mov ecx, [esp + 8 + 20] // width |
4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
4282 psrlw xmm5, 8 | 4282 psrlw xmm5, 8 |
4283 sub edi, edx | 4283 sub edi, edx |
4284 | 4284 |
4285 convertloop: | 4285 convertloop: |
4286 movdqu xmm0, [eax] | 4286 movdqu xmm0, [eax] |
4287 movdqu xmm1, [eax + 16] | 4287 movdqu xmm1, [eax + 16] |
4288 movdqu xmm2, [eax + esi] | 4288 movdqu xmm2, [eax + esi] |
4289 movdqu xmm3, [eax + esi + 16] | 4289 movdqu xmm3, [eax + esi + 16] |
4290 lea eax, [eax + 32] | 4290 lea eax, [eax + 32] |
(...skipping 14 matching lines...)
4305 jg convertloop | 4305 jg convertloop |
4306 | 4306 |
4307 pop edi | 4307 pop edi |
4308 pop esi | 4308 pop esi |
4309 ret | 4309 ret |
4310 } | 4310 } |
4311 } | 4311 } |
4312 | 4312 |
4313 __declspec(naked) | 4313 __declspec(naked) |
4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
4315 uint8* dst_u, uint8* dst_v, int pix) { | 4315 uint8* dst_u, uint8* dst_v, int width) { |
4316 __asm { | 4316 __asm { |
4317 push edi | 4317 push edi |
4318 mov eax, [esp + 4 + 4] // src_yuy2 | 4318 mov eax, [esp + 4 + 4] // src_yuy2 |
4319 mov edx, [esp + 4 + 8] // dst_u | 4319 mov edx, [esp + 4 + 8] // dst_u |
4320 mov edi, [esp + 4 + 12] // dst_v | 4320 mov edi, [esp + 4 + 12] // dst_v |
4321 mov ecx, [esp + 4 + 16] // pix | 4321 mov ecx, [esp + 4 + 16] // width |
4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
4323 psrlw xmm5, 8 | 4323 psrlw xmm5, 8 |
4324 sub edi, edx | 4324 sub edi, edx |
4325 | 4325 |
4326 convertloop: | 4326 convertloop: |
4327 movdqu xmm0, [eax] | 4327 movdqu xmm0, [eax] |
4328 movdqu xmm1, [eax + 16] | 4328 movdqu xmm1, [eax + 16] |
4329 lea eax, [eax + 32] | 4329 lea eax, [eax + 32] |
4330 psrlw xmm0, 8 // YUYV -> UVUV | 4330 psrlw xmm0, 8 // YUYV -> UVUV |
4331 psrlw xmm1, 8 | 4331 psrlw xmm1, 8 |
4332 packuswb xmm0, xmm1 | 4332 packuswb xmm0, xmm1 |
4333 movdqa xmm1, xmm0 | 4333 movdqa xmm1, xmm0 |
4334 pand xmm0, xmm5 // U | 4334 pand xmm0, xmm5 // U |
4335 packuswb xmm0, xmm0 | 4335 packuswb xmm0, xmm0 |
4336 psrlw xmm1, 8 // V | 4336 psrlw xmm1, 8 // V |
4337 packuswb xmm1, xmm1 | 4337 packuswb xmm1, xmm1 |
4338 movq qword ptr [edx], xmm0 | 4338 movq qword ptr [edx], xmm0 |
4339 movq qword ptr [edx + edi], xmm1 | 4339 movq qword ptr [edx + edi], xmm1 |
4340 lea edx, [edx + 8] | 4340 lea edx, [edx + 8] |
4341 sub ecx, 16 | 4341 sub ecx, 16 |
4342 jg convertloop | 4342 jg convertloop |
4343 | 4343 |
4344 pop edi | 4344 pop edi |
4345 ret | 4345 ret |
4346 } | 4346 } |
4347 } | 4347 } |
4348 | 4348 |
4349 __declspec(naked) | 4349 __declspec(naked) |
4350 void UYVYToYRow_SSE2(const uint8* src_uyvy, | 4350 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
4351 uint8* dst_y, int pix) { | 4351 uint8* dst_y, int width) { |
4352 __asm { | 4352 __asm { |
4353 mov eax, [esp + 4] // src_uyvy | 4353 mov eax, [esp + 4] // src_uyvy |
4354 mov edx, [esp + 8] // dst_y | 4354 mov edx, [esp + 8] // dst_y |
4355 mov ecx, [esp + 12] // pix | 4355 mov ecx, [esp + 12] // width |
4356 | 4356 |
4357 convertloop: | 4357 convertloop: |
4358 movdqu xmm0, [eax] | 4358 movdqu xmm0, [eax] |
4359 movdqu xmm1, [eax + 16] | 4359 movdqu xmm1, [eax + 16] |
4360 lea eax, [eax + 32] | 4360 lea eax, [eax + 32] |
4361 psrlw xmm0, 8 // odd bytes are Y | 4361 psrlw xmm0, 8 // odd bytes are Y |
4362 psrlw xmm1, 8 | 4362 psrlw xmm1, 8 |
4363 packuswb xmm0, xmm1 | 4363 packuswb xmm0, xmm1 |
4364 movdqu [edx], xmm0 | 4364 movdqu [edx], xmm0 |
4365 lea edx, [edx + 16] | 4365 lea edx, [edx + 16] |
4366 sub ecx, 16 | 4366 sub ecx, 16 |
4367 jg convertloop | 4367 jg convertloop |
4368 ret | 4368 ret |
4369 } | 4369 } |
4370 } | 4370 } |
4371 | 4371 |
4372 __declspec(naked) | 4372 __declspec(naked) |
4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
4374 uint8* dst_u, uint8* dst_v, int pix) { | 4374 uint8* dst_u, uint8* dst_v, int width) { |
4375 __asm { | 4375 __asm { |
4376 push esi | 4376 push esi |
4377 push edi | 4377 push edi |
4378 mov eax, [esp + 8 + 4] // src_yuy2 | 4378 mov eax, [esp + 8 + 4] // src_uyvy |
4379 mov esi, [esp + 8 + 8] // stride_yuy2 | 4379 mov esi, [esp + 8 + 8] // stride_uyvy |
4380 mov edx, [esp + 8 + 12] // dst_u | 4380 mov edx, [esp + 8 + 12] // dst_u |
4381 mov edi, [esp + 8 + 16] // dst_v | 4381 mov edi, [esp + 8 + 16] // dst_v |
4382 mov ecx, [esp + 8 + 20] // pix | 4382 mov ecx, [esp + 8 + 20] // width |
4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
4384 psrlw xmm5, 8 | 4384 psrlw xmm5, 8 |
4385 sub edi, edx | 4385 sub edi, edx |
4386 | 4386 |
4387 convertloop: | 4387 convertloop: |
4388 movdqu xmm0, [eax] | 4388 movdqu xmm0, [eax] |
4389 movdqu xmm1, [eax + 16] | 4389 movdqu xmm1, [eax + 16] |
4390 movdqu xmm2, [eax + esi] | 4390 movdqu xmm2, [eax + esi] |
4391 movdqu xmm3, [eax + esi + 16] | 4391 movdqu xmm3, [eax + esi + 16] |
4392 lea eax, [eax + 32] | 4392 lea eax, [eax + 32] |
(...skipping 14 matching lines...)
4407 jg convertloop | 4407 jg convertloop |
4408 | 4408 |
4409 pop edi | 4409 pop edi |
4410 pop esi | 4410 pop esi |
4411 ret | 4411 ret |
4412 } | 4412 } |
4413 } | 4413 } |
4414 | 4414 |
4415 __declspec(naked) | 4415 __declspec(naked) |
4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
4417 uint8* dst_u, uint8* dst_v, int pix) { | 4417 uint8* dst_u, uint8* dst_v, int width) { |
4418 __asm { | 4418 __asm { |
4419 push edi | 4419 push edi |
4420 mov eax, [esp + 4 + 4] // src_yuy2 | 4420 mov eax, [esp + 4 + 4] // src_uyvy |
4421 mov edx, [esp + 4 + 8] // dst_u | 4421 mov edx, [esp + 4 + 8] // dst_u |
4422 mov edi, [esp + 4 + 12] // dst_v | 4422 mov edi, [esp + 4 + 12] // dst_v |
4423 mov ecx, [esp + 4 + 16] // pix | 4423 mov ecx, [esp + 4 + 16] // width |
4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
4425 psrlw xmm5, 8 | 4425 psrlw xmm5, 8 |
4426 sub edi, edx | 4426 sub edi, edx |
4427 | 4427 |
4428 convertloop: | 4428 convertloop: |
4429 movdqu xmm0, [eax] | 4429 movdqu xmm0, [eax] |
4430 movdqu xmm1, [eax + 16] | 4430 movdqu xmm1, [eax + 16] |
4431 lea eax, [eax + 32] | 4431 lea eax, [eax + 32] |
4432 pand xmm0, xmm5 // UYVY -> UVUV | 4432 pand xmm0, xmm5 // UYVY -> UVUV |
4433 pand xmm1, xmm5 | 4433 pand xmm1, xmm5 |
(...skipping 1685 matching lines...)
6119 pop edi | 6119 pop edi |
6120 pop esi | 6120 pop esi |
6121 ret | 6121 ret |
6122 } | 6122 } |
6123 } | 6123 } |
6124 #endif // HAS_INTERPOLATEROW_SSE2 | 6124 #endif // HAS_INTERPOLATEROW_SSE2 |
6125 | 6125 |
6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
6127 __declspec(naked) | 6127 __declspec(naked) |
6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
6129 const uint8* shuffler, int pix) { | 6129 const uint8* shuffler, int width) { |
6130 __asm { | 6130 __asm { |
6131 mov eax, [esp + 4] // src_argb | 6131 mov eax, [esp + 4] // src_argb |
6132 mov edx, [esp + 8] // dst_argb | 6132 mov edx, [esp + 8] // dst_argb |
6133 mov ecx, [esp + 12] // shuffler | 6133 mov ecx, [esp + 12] // shuffler |
6134 movdqu xmm5, [ecx] | 6134 movdqu xmm5, [ecx] |
6135 mov ecx, [esp + 16] // pix | 6135 mov ecx, [esp + 16] // width |
6136 | 6136 |
6137 wloop: | 6137 wloop: |
6138 movdqu xmm0, [eax] | 6138 movdqu xmm0, [eax] |
6139 movdqu xmm1, [eax + 16] | 6139 movdqu xmm1, [eax + 16] |
6140 lea eax, [eax + 32] | 6140 lea eax, [eax + 32] |
6141 pshufb xmm0, xmm5 | 6141 pshufb xmm0, xmm5 |
6142 pshufb xmm1, xmm5 | 6142 pshufb xmm1, xmm5 |
6143 movdqu [edx], xmm0 | 6143 movdqu [edx], xmm0 |
6144 movdqu [edx + 16], xmm1 | 6144 movdqu [edx + 16], xmm1 |
6145 lea edx, [edx + 32] | 6145 lea edx, [edx + 32] |
6146 sub ecx, 8 | 6146 sub ecx, 8 |
6147 jg wloop | 6147 jg wloop |
6148 ret | 6148 ret |
6149 } | 6149 } |
6150 } | 6150 } |
6151 | 6151 |
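pshufb permutes bytes within each 16-byte register using the shuffler as an index table, and the channel-reorder masks repeat the same 4-byte pattern per pixel, so the whole row reduces to a per-pixel byte permutation. A scalar sketch (hypothetical name), assuming shuffler[0..3] hold source-byte indices 0-3:

    static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                      const uint8_t* shuffler, int width) {
      for (int x = 0; x < width; ++x) {
        /* Each output byte selects a byte within the same 4-byte pixel. */
        dst_argb[0] = src_argb[shuffler[0]];
        dst_argb[1] = src_argb[shuffler[1]];
        dst_argb[2] = src_argb[shuffler[2]];
        dst_argb[3] = src_argb[shuffler[3]];
        src_argb += 4;
        dst_argb += 4;
      }
    }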
6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
6153 __declspec(naked) | 6153 __declspec(naked) |
6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
6155 const uint8* shuffler, int pix) { | 6155 const uint8* shuffler, int width) { |
6156 __asm { | 6156 __asm { |
6157 mov eax, [esp + 4] // src_argb | 6157 mov eax, [esp + 4] // src_argb |
6158 mov edx, [esp + 8] // dst_argb | 6158 mov edx, [esp + 8] // dst_argb |
6159 mov ecx, [esp + 12] // shuffler | 6159 mov ecx, [esp + 12] // shuffler |
6160 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. | 6160 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
6161 mov ecx, [esp + 16] // pix | 6161 mov ecx, [esp + 16] // width |
6162 | 6162 |
6163 wloop: | 6163 wloop: |
6164 vmovdqu ymm0, [eax] | 6164 vmovdqu ymm0, [eax] |
6165 vmovdqu ymm1, [eax + 32] | 6165 vmovdqu ymm1, [eax + 32] |
6166 lea eax, [eax + 64] | 6166 lea eax, [eax + 64] |
6167 vpshufb ymm0, ymm0, ymm5 | 6167 vpshufb ymm0, ymm0, ymm5 |
6168 vpshufb ymm1, ymm1, ymm5 | 6168 vpshufb ymm1, ymm1, ymm5 |
6169 vmovdqu [edx], ymm0 | 6169 vmovdqu [edx], ymm0 |
6170 vmovdqu [edx + 32], ymm1 | 6170 vmovdqu [edx + 32], ymm1 |
6171 lea edx, [edx + 64] | 6171 lea edx, [edx + 64] |
6172 sub ecx, 16 | 6172 sub ecx, 16 |
6173 jg wloop | 6173 jg wloop |
6174 | 6174 |
6175 vzeroupper | 6175 vzeroupper |
6176 ret | 6176 ret |
6177 } | 6177 } |
6178 } | 6178 } |
6179 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 6179 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
6180 | 6180 |
6181 __declspec(naked) | 6181 __declspec(naked) |
6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
6183 const uint8* shuffler, int pix) { | 6183 const uint8* shuffler, int width) { |
6184 __asm { | 6184 __asm { |
6185 push ebx | 6185 push ebx |
6186 push esi | 6186 push esi |
6187 mov eax, [esp + 8 + 4] // src_argb | 6187 mov eax, [esp + 8 + 4] // src_argb |
6188 mov edx, [esp + 8 + 8] // dst_argb | 6188 mov edx, [esp + 8 + 8] // dst_argb |
6189 mov esi, [esp + 8 + 12] // shuffler | 6189 mov esi, [esp + 8 + 12] // shuffler |
6190 mov ecx, [esp + 8 + 16] // pix | 6190 mov ecx, [esp + 8 + 16] // width |
6191 pxor xmm5, xmm5 | 6191 pxor xmm5, xmm5 |
6192 | 6192 |
6193 mov ebx, [esi] // shuffler | 6193 mov ebx, [esi] // shuffler |
6194 cmp ebx, 0x03000102 | 6194 cmp ebx, 0x03000102 |
6195 je shuf_3012 | 6195 je shuf_3012 |
6196 cmp ebx, 0x00010203 | 6196 cmp ebx, 0x00010203 |
6197 je shuf_0123 | 6197 je shuf_0123 |
6198 cmp ebx, 0x00030201 | 6198 cmp ebx, 0x00030201 |
6199 je shuf_0321 | 6199 je shuf_0321 |
6200 cmp ebx, 0x02010003 | 6200 cmp ebx, 0x02010003 |
(...skipping 435 matching lines...)
6636 } | 6636 } |
6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6638 | 6638 |
6639 #endif // defined(_M_X64) | 6639 #endif // defined(_M_X64) |
6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6641 | 6641 |
6642 #ifdef __cplusplus | 6642 #ifdef __cplusplus |
6643 } // extern "C" | 6643 } // extern "C" |
6644 } // namespace libyuv | 6644 } // namespace libyuv |
6645 #endif | 6645 #endif |