| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 309 matching lines...) |
| 320 }; | 320 }; |
| 321 | 321 |
| 322 // NV21 shuf 8 VU to 16 UV. | 322 // NV21 shuf 8 VU to 16 UV. |
| 323 static const lvec8 kShuffleNV21 = { | 323 static const lvec8 kShuffleNV21 = { |
| 324 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 324 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 325 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 325 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 326 }; | 326 }; |
| 327 | 327 |
| 328 // Duplicates gray value 3 times and fills in alpha opaque. | 328 // Duplicates gray value 3 times and fills in alpha opaque. |
| 329 __declspec(naked) | 329 __declspec(naked) |
| 330 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 330 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { |
| 331 __asm { | 331 __asm { |
| 332 mov eax, [esp + 4] // src_y | 332 mov eax, [esp + 4] // src_y |
| 333 mov edx, [esp + 8] // dst_argb | 333 mov edx, [esp + 8] // dst_argb |
| 334 mov ecx, [esp + 12] // pix | 334 mov ecx, [esp + 12] // width |
| 335 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 335 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 336 pslld xmm5, 24 | 336 pslld xmm5, 24 |
| 337 | 337 |
| 338 convertloop: | 338 convertloop: |
| 339 movq xmm0, qword ptr [eax] | 339 movq xmm0, qword ptr [eax] |
| 340 lea eax, [eax + 8] | 340 lea eax, [eax + 8] |
| 341 punpcklbw xmm0, xmm0 | 341 punpcklbw xmm0, xmm0 |
| 342 movdqa xmm1, xmm0 | 342 movdqa xmm1, xmm0 |
| 343 punpcklwd xmm0, xmm0 | 343 punpcklwd xmm0, xmm0 |
| 344 punpckhwd xmm1, xmm1 | 344 punpckhwd xmm1, xmm1 |
| 345 por xmm0, xmm5 | 345 por xmm0, xmm5 |
| 346 por xmm1, xmm5 | 346 por xmm1, xmm5 |
| 347 movdqu [edx], xmm0 | 347 movdqu [edx], xmm0 |
| 348 movdqu [edx + 16], xmm1 | 348 movdqu [edx + 16], xmm1 |
| 349 lea edx, [edx + 32] | 349 lea edx, [edx + 32] |
| 350 sub ecx, 8 | 350 sub ecx, 8 |
| 351 jg convertloop | 351 jg convertloop |
| 352 ret | 352 ret |
| 353 } | 353 } |
| 354 } | 354 } |
| 355 | 355 |
#ifdef HAS_J400TOARGBROW_AVX2
// AVX2 gray-to-ARGB expansion: each gray byte Y becomes Y, Y, Y, 0xff.
// Handles 16 pixels per pass (16 bytes in, 64 bytes out).  The loop is
// bottom-tested, so one pass always runs.  The vpermq 0xd8 steps swap the two
// middle quadwords so that the in-lane unpacks yield pixels in source order.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    vpcmpeqb   ymm5, ymm5, ymm5      // all ones ...
    vpslld     ymm5, ymm5, 24        // ... shifted to alpha mask 0xff000000
    mov        ecx, [esp + 12]       // width
    mov        edx, [esp + 8]        // dst_argb
    mov        eax, [esp + 4]        // src_y

  gray16:
    vmovdqu    xmm0, [eax]           // load 16 gray bytes
    vpermq     ymm0, ymm0, 0xd8      // one quadword of gray per 128-bit lane
    vpunpcklbw ymm0, ymm0, ymm0      // double every byte
    vpermq     ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0      // pixels 8..15, gray x4
    vpunpcklwd ymm0, ymm0, ymm0      // pixels 0..7, gray x4
    vpor       ymm0, ymm0, ymm5      // set alpha
    vmovdqu    [edx], ymm0
    vpor       ymm1, ymm1, ymm5      // set alpha
    vmovdqu    [edx + 32], ymm1
    lea        eax, [eax + 16]       // advance source by 16 bytes
    lea        edx, [edx + 64]       // advance destination by 64 bytes
    sub        ecx, 16
    jg         gray16
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2
| 387 | 387 |
| 388 __declspec(naked) | 388 __declspec(naked) |
| 389 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 389 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { |
| 390 __asm { | 390 __asm { |
| 391 mov eax, [esp + 4] // src_rgb24 | 391 mov eax, [esp + 4] // src_rgb24 |
| 392 mov edx, [esp + 8] // dst_argb | 392 mov edx, [esp + 8] // dst_argb |
| 393 mov ecx, [esp + 12] // pix | 393 mov ecx, [esp + 12] // width |
| 394 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 394 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 395 pslld xmm5, 24 | 395 pslld xmm5, 24 |
| 396 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB | 396 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB |
| 397 | 397 |
| 398 convertloop: | 398 convertloop: |
| 399 movdqu xmm0, [eax] | 399 movdqu xmm0, [eax] |
| 400 movdqu xmm1, [eax + 16] | 400 movdqu xmm1, [eax + 16] |
| 401 movdqu xmm3, [eax + 32] | 401 movdqu xmm3, [eax + 32] |
| 402 lea eax, [eax + 48] | 402 lea eax, [eax + 48] |
| 403 movdqa xmm2, xmm3 | 403 movdqa xmm2, xmm3 |
| (...skipping 14 matching lines...) |
| 418 movdqu [edx + 48], xmm3 | 418 movdqu [edx + 48], xmm3 |
| 419 lea edx, [edx + 64] | 419 lea edx, [edx + 64] |
| 420 sub ecx, 16 | 420 sub ecx, 16 |
| 421 jg convertloop | 421 jg convertloop |
| 422 ret | 422 ret |
| 423 } | 423 } |
| 424 } | 424 } |
| 425 | 425 |
| 426 __declspec(naked) | 426 __declspec(naked) |
| 427 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, | 427 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, |
| 428 int pix) { | 428 int width) { |
| 429 __asm { | 429 __asm { |
| 430 mov eax, [esp + 4] // src_raw | 430 mov eax, [esp + 4] // src_raw |
| 431 mov edx, [esp + 8] // dst_argb | 431 mov edx, [esp + 8] // dst_argb |
| 432 mov ecx, [esp + 12] // pix | 432 mov ecx, [esp + 12] // width |
| 433 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 | 433 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 |
| 434 pslld xmm5, 24 | 434 pslld xmm5, 24 |
| 435 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB | 435 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB |
| 436 | 436 |
| 437 convertloop: | 437 convertloop: |
| 438 movdqu xmm0, [eax] | 438 movdqu xmm0, [eax] |
| 439 movdqu xmm1, [eax + 16] | 439 movdqu xmm1, [eax + 16] |
| 440 movdqu xmm3, [eax + 32] | 440 movdqu xmm3, [eax + 32] |
| 441 lea eax, [eax + 48] | 441 lea eax, [eax + 48] |
| 442 movdqa xmm2, xmm3 | 442 movdqa xmm2, xmm3 |
| (...skipping 21 matching lines...) |
| 464 | 464 |
| 465 // pmul method to replicate bits. | 465 // pmul method to replicate bits. |
| 466 // Math to replicate bits: | 466 // Math to replicate bits: |
| 467 // (v << 8) | (v << 3) | 467 // (v << 8) | (v << 3) |
| 468 // v * 256 + v * 8 | 468 // v * 256 + v * 8 |
| 469 // v * (256 + 8) | 469 // v * (256 + 8) |
| 470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 470 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 471 // 20 instructions. | 471 // 20 instructions. |
| 472 __declspec(naked) | 472 __declspec(naked) |
| 473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, | 473 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, |
| 474 int pix) { | 474 int width) { |
| 475 __asm { | 475 __asm { |
| 476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 476 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 477 movd xmm5, eax | 477 movd xmm5, eax |
| 478 pshufd xmm5, xmm5, 0 | 478 pshufd xmm5, xmm5, 0 |
| 479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 479 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 480 movd xmm6, eax | 480 movd xmm6, eax |
| 481 pshufd xmm6, xmm6, 0 | 481 pshufd xmm6, xmm6, 0 |
| 482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 482 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| 483 psllw xmm3, 11 | 483 psllw xmm3, 11 |
| 484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green | 484 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green |
| 485 psllw xmm4, 10 | 485 psllw xmm4, 10 |
| 486 psrlw xmm4, 5 | 486 psrlw xmm4, 5 |
| 487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha | 487 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| 488 psllw xmm7, 8 | 488 psllw xmm7, 8 |
| 489 | 489 |
| 490 mov eax, [esp + 4] // src_rgb565 | 490 mov eax, [esp + 4] // src_rgb565 |
| 491 mov edx, [esp + 8] // dst_argb | 491 mov edx, [esp + 8] // dst_argb |
| 492 mov ecx, [esp + 12] // pix | 492 mov ecx, [esp + 12] // width |
| 493 sub edx, eax | 493 sub edx, eax |
| 494 sub edx, eax | 494 sub edx, eax |
| 495 | 495 |
| 496 convertloop: | 496 convertloop: |
| 497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 | 497 movdqu xmm0, [eax] // fetch 8 pixels of bgr565 |
| 498 movdqa xmm1, xmm0 | 498 movdqa xmm1, xmm0 |
| 499 movdqa xmm2, xmm0 | 499 movdqa xmm2, xmm0 |
| 500 pand xmm1, xmm3 // R in upper 5 bits | 500 pand xmm1, xmm3 // R in upper 5 bits |
| 501 psllw xmm2, 11 // B in upper 5 bits | 501 psllw xmm2, 11 // B in upper 5 bits |
| 502 pmulhuw xmm1, xmm5 // * (256 + 8) | 502 pmulhuw xmm1, xmm5 // * (256 + 8) |
| (...skipping 17 matching lines...) |
| 520 | 520 |
| 521 #ifdef HAS_RGB565TOARGBROW_AVX2 | 521 #ifdef HAS_RGB565TOARGBROW_AVX2 |
| 522 // pmul method to replicate bits. | 522 // pmul method to replicate bits. |
| 523 // Math to replicate bits: | 523 // Math to replicate bits: |
| 524 // (v << 8) | (v << 3) | 524 // (v << 8) | (v << 3) |
| 525 // v * 256 + v * 8 | 525 // v * 256 + v * 8 |
| 526 // v * (256 + 8) | 526 // v * (256 + 8) |
| 527 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 | 527 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 |
| 528 __declspec(naked) | 528 __declspec(naked) |
| 529 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, | 529 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, |
| 530 int pix) { | 530 int width) { |
| 531 __asm { | 531 __asm { |
| 532 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 532 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 533 vmovd xmm5, eax | 533 vmovd xmm5, eax |
| 534 vbroadcastss ymm5, xmm5 | 534 vbroadcastss ymm5, xmm5 |
| 535 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits | 535 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits |
| 536 movd xmm6, eax | 536 movd xmm6, eax |
| 537 vbroadcastss ymm6, xmm6 | 537 vbroadcastss ymm6, xmm6 |
| 538 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 538 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 539 vpsllw ymm3, ymm3, 11 | 539 vpsllw ymm3, ymm3, 11 |
| 540 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green | 540 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green |
| 541 vpsllw ymm4, ymm4, 10 | 541 vpsllw ymm4, ymm4, 10 |
| 542 vpsrlw ymm4, ymm4, 5 | 542 vpsrlw ymm4, ymm4, 5 |
| 543 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 543 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 544 vpsllw ymm7, ymm7, 8 | 544 vpsllw ymm7, ymm7, 8 |
| 545 | 545 |
| 546 mov eax, [esp + 4] // src_rgb565 | 546 mov eax, [esp + 4] // src_rgb565 |
| 547 mov edx, [esp + 8] // dst_argb | 547 mov edx, [esp + 8] // dst_argb |
| 548 mov ecx, [esp + 12] // pix | 548 mov ecx, [esp + 12] // width |
| 549 sub edx, eax | 549 sub edx, eax |
| 550 sub edx, eax | 550 sub edx, eax |
| 551 | 551 |
| 552 convertloop: | 552 convertloop: |
| 553 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 | 553 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 |
| 554 vpand ymm1, ymm0, ymm3 // R in upper 5 bits | 554 vpand ymm1, ymm0, ymm3 // R in upper 5 bits |
| 555 vpsllw ymm2, ymm0, 11 // B in upper 5 bits | 555 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 556 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) | 556 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| 557 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) | 557 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 558 vpsllw ymm1, ymm1, 8 | 558 vpsllw ymm1, ymm1, 8 |
| (...skipping 12 matching lines...) |
| 571 jg convertloop | 571 jg convertloop |
| 572 vzeroupper | 572 vzeroupper |
| 573 ret | 573 ret |
| 574 } | 574 } |
| 575 } | 575 } |
| 576 #endif // HAS_RGB565TOARGBROW_AVX2 | 576 #endif // HAS_RGB565TOARGBROW_AVX2 |
| 577 | 577 |
| 578 #ifdef HAS_ARGB1555TOARGBROW_AVX2 | 578 #ifdef HAS_ARGB1555TOARGBROW_AVX2 |
| 579 __declspec(naked) | 579 __declspec(naked) |
| 580 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, | 580 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, |
| 581 int pix) { | 581 int width) { |
| 582 __asm { | 582 __asm { |
| 583 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 583 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 584 vmovd xmm5, eax | 584 vmovd xmm5, eax |
| 585 vbroadcastss ymm5, xmm5 | 585 vbroadcastss ymm5, xmm5 |
| 586 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 586 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 587 movd xmm6, eax | 587 movd xmm6, eax |
| 588 vbroadcastss ymm6, xmm6 | 588 vbroadcastss ymm6, xmm6 |
| 589 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red | 589 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red |
| 590 vpsllw ymm3, ymm3, 11 | 590 vpsllw ymm3, ymm3, 11 |
| 591 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green | 591 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green |
| 592 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha | 592 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha |
| 593 vpsllw ymm7, ymm7, 8 | 593 vpsllw ymm7, ymm7, 8 |
| 594 | 594 |
| 595 mov eax, [esp + 4] // src_argb1555 | 595 mov eax, [esp + 4] // src_argb1555 |
| 596 mov edx, [esp + 8] // dst_argb | 596 mov edx, [esp + 8] // dst_argb |
| 597 mov ecx, [esp + 12] // pix | 597 mov ecx, [esp + 12] // width |
| 598 sub edx, eax | 598 sub edx, eax |
| 599 sub edx, eax | 599 sub edx, eax |
| 600 | 600 |
| 601 convertloop: | 601 convertloop: |
| 602 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 | 602 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 |
| 603 vpsllw ymm1, ymm0, 1 // R in upper 5 bits | 603 vpsllw ymm1, ymm0, 1 // R in upper 5 bits |
| 604 vpsllw ymm2, ymm0, 11 // B in upper 5 bits | 604 vpsllw ymm2, ymm0, 11 // B in upper 5 bits |
| 605 vpand ymm1, ymm1, ymm3 | 605 vpand ymm1, ymm1, ymm3 |
| 606 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) | 606 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) |
| 607 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) | 607 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) |
| (...skipping 15 matching lines...) |
| 623 jg convertloop | 623 jg convertloop |
| 624 vzeroupper | 624 vzeroupper |
| 625 ret | 625 ret |
| 626 } | 626 } |
| 627 } | 627 } |
| 628 #endif // HAS_ARGB1555TOARGBROW_AVX2 | 628 #endif // HAS_ARGB1555TOARGBROW_AVX2 |
| 629 | 629 |
#ifdef HAS_ARGB4444TOARGBROW_AVX2
// Widens 16-bit ARGB4444 pixels to 32-bit ARGB, 16 pixels per pass.
// Every 4-bit nibble n is expanded to the byte (n << 4) | n; the expanded
// low-nibble and high-nibble bytes are then interleaved.  The loop is
// bottom-tested, so one pass always runs.
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov        ecx, 0x0f0f0f0f       // ecx is scratch until width is loaded
    vmovd      xmm4, ecx
    vbroadcastss ymm4, xmm4          // low-nibble mask 0x0f0f0f0f
    vpslld     ymm5, ymm4, 4         // high-nibble mask 0xf0f0f0f0
    mov        ecx, [esp + 12]       // width
    mov        edx, [esp + 8]        // dst_argb
    mov        eax, [esp + 4]        // src_argb4444
    sub        edx, eax              // edx = dst_argb - 2 * src_argb4444, so
    sub        edx, eax              // [eax * 2 + edx] follows dst as eax moves

  expand16:
    vmovdqu    ymm0, [eax]           // fetch 16 pixels (32 bytes)
    vpand      ymm2, ymm0, ymm5      // keep high nibbles
    vpand      ymm0, ymm0, ymm4      // keep low nibbles
    vpsllw     ymm1, ymm0, 4
    vpor       ymm0, ymm0, ymm1      // low nibble copied into both halves
    vpermq     ymm0, ymm0, 0xd8      // reorder quadwords for in-lane unpack
    vpsrlw     ymm3, ymm2, 4
    vpor       ymm2, ymm2, ymm3      // high nibble copied into both halves
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2      // pixels 8..15
    vpunpcklbw ymm0, ymm0, ymm2      // pixels 0..7
    vmovdqu    [eax * 2 + edx], ymm0       // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         expand16
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2
| 667 | 667 |
| 668 // 24 instructions | 668 // 24 instructions |
| 669 __declspec(naked) | 669 __declspec(naked) |
| 670 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, | 670 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, |
| 671 int pix) { | 671 int width) { |
| 672 __asm { | 672 __asm { |
| 673 mov eax, 0x01080108 // generate multiplier to repeat 5 bits | 673 mov eax, 0x01080108 // generate multiplier to repeat 5 bits |
| 674 movd xmm5, eax | 674 movd xmm5, eax |
| 675 pshufd xmm5, xmm5, 0 | 675 pshufd xmm5, xmm5, 0 |
| 676 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits | 676 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits |
| 677 movd xmm6, eax | 677 movd xmm6, eax |
| 678 pshufd xmm6, xmm6, 0 | 678 pshufd xmm6, xmm6, 0 |
| 679 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red | 679 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red |
| 680 psllw xmm3, 11 | 680 psllw xmm3, 11 |
| 681 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green | 681 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green |
| 682 psrlw xmm4, 6 | 682 psrlw xmm4, 6 |
| 683 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha | 683 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha |
| 684 psllw xmm7, 8 | 684 psllw xmm7, 8 |
| 685 | 685 |
| 686 mov eax, [esp + 4] // src_argb1555 | 686 mov eax, [esp + 4] // src_argb1555 |
| 687 mov edx, [esp + 8] // dst_argb | 687 mov edx, [esp + 8] // dst_argb |
| 688 mov ecx, [esp + 12] // pix | 688 mov ecx, [esp + 12] // width |
| 689 sub edx, eax | 689 sub edx, eax |
| 690 sub edx, eax | 690 sub edx, eax |
| 691 | 691 |
| 692 convertloop: | 692 convertloop: |
| 693 movdqu xmm0, [eax] // fetch 8 pixels of 1555 | 693 movdqu xmm0, [eax] // fetch 8 pixels of 1555 |
| 694 movdqa xmm1, xmm0 | 694 movdqa xmm1, xmm0 |
| 695 movdqa xmm2, xmm0 | 695 movdqa xmm2, xmm0 |
| 696 psllw xmm1, 1 // R in upper 5 bits | 696 psllw xmm1, 1 // R in upper 5 bits |
| 697 psllw xmm2, 11 // B in upper 5 bits | 697 psllw xmm2, 11 // B in upper 5 bits |
| 698 pand xmm1, xmm3 | 698 pand xmm1, xmm3 |
| (...skipping 15 matching lines...) |
| 714 lea eax, [eax + 16] | 714 lea eax, [eax + 16] |
| 715 sub ecx, 8 | 715 sub ecx, 8 |
| 716 jg convertloop | 716 jg convertloop |
| 717 ret | 717 ret |
| 718 } | 718 } |
| 719 } | 719 } |
| 720 | 720 |
| 721 // 18 instructions. | 721 // 18 instructions. |
| 722 __declspec(naked) | 722 __declspec(naked) |
| 723 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, | 723 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, |
| 724 int pix) { | 724 int width) { |
| 725 __asm { | 725 __asm { |
| 726 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f | 726 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f |
| 727 movd xmm4, eax | 727 movd xmm4, eax |
| 728 pshufd xmm4, xmm4, 0 | 728 pshufd xmm4, xmm4, 0 |
| 729 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles | 729 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles |
| 730 pslld xmm5, 4 | 730 pslld xmm5, 4 |
| 731 mov eax, [esp + 4] // src_argb4444 | 731 mov eax, [esp + 4] // src_argb4444 |
| 732 mov edx, [esp + 8] // dst_argb | 732 mov edx, [esp + 8] // dst_argb |
| 733 mov ecx, [esp + 12] // pix | 733 mov ecx, [esp + 12] // width |
| 734 sub edx, eax | 734 sub edx, eax |
| 735 sub edx, eax | 735 sub edx, eax |
| 736 | 736 |
| 737 convertloop: | 737 convertloop: |
| 738 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 | 738 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 |
| 739 movdqa xmm2, xmm0 | 739 movdqa xmm2, xmm0 |
| 740 pand xmm0, xmm4 // mask low nibbles | 740 pand xmm0, xmm4 // mask low nibbles |
| 741 pand xmm2, xmm5 // mask high nibbles | 741 pand xmm2, xmm5 // mask high nibbles |
| 742 movdqa xmm1, xmm0 | 742 movdqa xmm1, xmm0 |
| 743 movdqa xmm3, xmm2 | 743 movdqa xmm3, xmm2 |
| 744 psllw xmm1, 4 | 744 psllw xmm1, 4 |
| 745 psrlw xmm3, 4 | 745 psrlw xmm3, 4 |
| 746 por xmm0, xmm1 | 746 por xmm0, xmm1 |
| 747 por xmm2, xmm3 | 747 por xmm2, xmm3 |
| 748 movdqa xmm1, xmm0 | 748 movdqa xmm1, xmm0 |
| 749 punpcklbw xmm0, xmm2 | 749 punpcklbw xmm0, xmm2 |
| 750 punpckhbw xmm1, xmm2 | 750 punpckhbw xmm1, xmm2 |
| 751 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB | 751 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB |
| 752 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB | 752 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB |
| 753 lea eax, [eax + 16] | 753 lea eax, [eax + 16] |
| 754 sub ecx, 8 | 754 sub ecx, 8 |
| 755 jg convertloop | 755 jg convertloop |
| 756 ret | 756 ret |
| 757 } | 757 } |
| 758 } | 758 } |
| 759 | 759 |
| 760 __declspec(naked) | 760 __declspec(naked) |
| 761 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 761 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 762 __asm { | 762 __asm { |
| 763 mov eax, [esp + 4] // src_argb | 763 mov eax, [esp + 4] // src_argb |
| 764 mov edx, [esp + 8] // dst_rgb | 764 mov edx, [esp + 8] // dst_rgb |
| 765 mov ecx, [esp + 12] // pix | 765 mov ecx, [esp + 12] // width |
| 766 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 766 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
| 767 | 767 |
| 768 convertloop: | 768 convertloop: |
| 769 movdqu xmm0, [eax] // fetch 16 pixels of argb | 769 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 770 movdqu xmm1, [eax + 16] | 770 movdqu xmm1, [eax + 16] |
| 771 movdqu xmm2, [eax + 32] | 771 movdqu xmm2, [eax + 32] |
| 772 movdqu xmm3, [eax + 48] | 772 movdqu xmm3, [eax + 48] |
| 773 lea eax, [eax + 64] | 773 lea eax, [eax + 64] |
| 774 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB | 774 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| 775 pshufb xmm1, xmm6 | 775 pshufb xmm1, xmm6 |
| (...skipping 13 matching lines...) |
| 789 movdqu [edx + 16], xmm1 // store 1 | 789 movdqu [edx + 16], xmm1 // store 1 |
| 790 movdqu [edx + 32], xmm2 // store 2 | 790 movdqu [edx + 32], xmm2 // store 2 |
| 791 lea edx, [edx + 48] | 791 lea edx, [edx + 48] |
| 792 sub ecx, 16 | 792 sub ecx, 16 |
| 793 jg convertloop | 793 jg convertloop |
| 794 ret | 794 ret |
| 795 } | 795 } |
| 796 } | 796 } |
| 797 | 797 |
| 798 __declspec(naked) | 798 __declspec(naked) |
| 799 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { | 799 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 800 __asm { | 800 __asm { |
| 801 mov eax, [esp + 4] // src_argb | 801 mov eax, [esp + 4] // src_argb |
| 802 mov edx, [esp + 8] // dst_rgb | 802 mov edx, [esp + 8] // dst_rgb |
| 803 mov ecx, [esp + 12] // pix | 803 mov ecx, [esp + 12] // width |
| 804 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 804 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
| 805 | 805 |
| 806 convertloop: | 806 convertloop: |
| 807 movdqu xmm0, [eax] // fetch 16 pixels of argb | 807 movdqu xmm0, [eax] // fetch 16 pixels of argb |
| 808 movdqu xmm1, [eax + 16] | 808 movdqu xmm1, [eax + 16] |
| 809 movdqu xmm2, [eax + 32] | 809 movdqu xmm2, [eax + 32] |
| 810 movdqu xmm3, [eax + 48] | 810 movdqu xmm3, [eax + 48] |
| 811 lea eax, [eax + 64] | 811 lea eax, [eax + 64] |
| 812 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB | 812 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB |
| 813 pshufb xmm1, xmm6 | 813 pshufb xmm1, xmm6 |
| (...skipping 13 matching lines...) |
| 827 movdqu [edx + 16], xmm1 // store 1 | 827 movdqu [edx + 16], xmm1 // store 1 |
| 828 movdqu [edx + 32], xmm2 // store 2 | 828 movdqu [edx + 32], xmm2 // store 2 |
| 829 lea edx, [edx + 48] | 829 lea edx, [edx + 48] |
| 830 sub ecx, 16 | 830 sub ecx, 16 |
| 831 jg convertloop | 831 jg convertloop |
| 832 ret | 832 ret |
| 833 } | 833 } |
| 834 } | 834 } |
| 835 | 835 |
| 836 __declspec(naked) | 836 __declspec(naked) |
| 837 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 837 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 838 __asm { | 838 __asm { |
| 839 mov eax, [esp + 4] // src_argb | 839 mov eax, [esp + 4] // src_argb |
| 840 mov edx, [esp + 8] // dst_rgb | 840 mov edx, [esp + 8] // dst_rgb |
| 841 mov ecx, [esp + 12] // pix | 841 mov ecx, [esp + 12] // width |
| 842 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 842 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 843 psrld xmm3, 27 | 843 psrld xmm3, 27 |
| 844 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 844 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 845 psrld xmm4, 26 | 845 psrld xmm4, 26 |
| 846 pslld xmm4, 5 | 846 pslld xmm4, 5 |
| 847 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 | 847 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| 848 pslld xmm5, 11 | 848 pslld xmm5, 11 |
| 849 | 849 |
| 850 convertloop: | 850 convertloop: |
| 851 movdqu xmm0, [eax] // fetch 4 pixels of argb | 851 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| (...skipping 13 matching lines...) |
| 865 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 | 865 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 |
| 866 lea edx, [edx + 8] | 866 lea edx, [edx + 8] |
| 867 sub ecx, 4 | 867 sub ecx, 4 |
| 868 jg convertloop | 868 jg convertloop |
| 869 ret | 869 ret |
| 870 } | 870 } |
| 871 } | 871 } |
| 872 | 872 |
| 873 __declspec(naked) | 873 __declspec(naked) |
| 874 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, | 874 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, |
| 875 const uint32 dither4, int pix) { | 875 const uint32 dither4, int width) { |
| 876 __asm { | 876 __asm { |
| 877 | 877 |
| 878 mov eax, [esp + 4] // src_argb | 878 mov eax, [esp + 4] // src_argb |
| 879 mov edx, [esp + 8] // dst_rgb | 879 mov edx, [esp + 8] // dst_rgb |
| 880 movd xmm6, [esp + 12] // dither4 | 880 movd xmm6, [esp + 12] // dither4 |
| 881 mov ecx, [esp + 16] // pix | 881 mov ecx, [esp + 16] // width |
| 882 punpcklbw xmm6, xmm6 // make dither 16 bytes | 882 punpcklbw xmm6, xmm6 // make dither 16 bytes |
| 883 movdqa xmm7, xmm6 | 883 movdqa xmm7, xmm6 |
| 884 punpcklwd xmm6, xmm6 | 884 punpcklwd xmm6, xmm6 |
| 885 punpckhwd xmm7, xmm7 | 885 punpckhwd xmm7, xmm7 |
| 886 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f | 886 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f |
| 887 psrld xmm3, 27 | 887 psrld xmm3, 27 |
| 888 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 | 888 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 |
| 889 psrld xmm4, 26 | 889 psrld xmm4, 26 |
| 890 pslld xmm4, 5 | 890 pslld xmm4, 5 |
| 891 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 | 891 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 |
| (...skipping 19 matching lines...) |
| 911 lea edx, [edx + 8] | 911 lea edx, [edx + 8] |
| 912 sub ecx, 4 | 912 sub ecx, 4 |
| 913 jg convertloop | 913 jg convertloop |
| 914 ret | 914 ret |
| 915 } | 915 } |
| 916 } | 916 } |
| 917 | 917 |
| 918 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 918 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
| 919 __declspec(naked) | 919 __declspec(naked) |
| 920 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, | 920 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, |
| 921 const uint32 dither4, int pix) { | 921 const uint32 dither4, int width) { |
| 922 __asm { | 922 __asm { |
| 923 mov eax, [esp + 4] // src_argb | 923 mov eax, [esp + 4] // src_argb |
| 924 mov edx, [esp + 8] // dst_rgb | 924 mov edx, [esp + 8] // dst_rgb |
| 925 vbroadcastss xmm6, [esp + 12] // dither4 | 925 vbroadcastss xmm6, [esp + 12] // dither4 |
| 926 mov ecx, [esp + 16] // pix | 926 mov ecx, [esp + 16] // width |
| 927 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes | 927 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes |
| 928 vpermq ymm6, ymm6, 0xd8 | 928 vpermq ymm6, ymm6, 0xd8 |
| 929 vpunpcklwd ymm6, ymm6, ymm6 | 929 vpunpcklwd ymm6, ymm6, ymm6 |
| 930 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 930 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 931 vpsrld ymm3, ymm3, 27 | 931 vpsrld ymm3, ymm3, 27 |
| 932 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 932 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 933 vpsrld ymm4, ymm4, 26 | 933 vpsrld ymm4, ymm4, 26 |
| 934 vpslld ymm4, ymm4, 5 | 934 vpslld ymm4, ymm4, 5 |
| 935 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 | 935 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 936 | 936 |
| (...skipping 16 matching lines...) |
| 953 sub ecx, 8 | 953 sub ecx, 8 |
| 954 jg convertloop | 954 jg convertloop |
| 955 vzeroupper | 955 vzeroupper |
| 956 ret | 956 ret |
| 957 } | 957 } |
| 958 } | 958 } |
| 959 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 | 959 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| 960 | 960 |
| 961 // TODO(fbarchard): Improve sign extension/packing. | 961 // TODO(fbarchard): Improve sign extension/packing. |
| 962 __declspec(naked) | 962 __declspec(naked) |
| 963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 963 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 964 __asm { | 964 __asm { |
| 965 mov eax, [esp + 4] // src_argb | 965 mov eax, [esp + 4] // src_argb |
| 966 mov edx, [esp + 8] // dst_rgb | 966 mov edx, [esp + 8] // dst_rgb |
| 967 mov ecx, [esp + 12] // pix | 967 mov ecx, [esp + 12] // width |
| 968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f | 968 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f |
| 969 psrld xmm4, 27 | 969 psrld xmm4, 27 |
| 970 movdqa xmm5, xmm4 // generate mask 0x000003e0 | 970 movdqa xmm5, xmm4 // generate mask 0x000003e0 |
| 971 pslld xmm5, 5 | 971 pslld xmm5, 5 |
| 972 movdqa xmm6, xmm4 // generate mask 0x00007c00 | 972 movdqa xmm6, xmm4 // generate mask 0x00007c00 |
| 973 pslld xmm6, 10 | 973 pslld xmm6, 10 |
| 974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 | 974 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 |
| 975 pslld xmm7, 15 | 975 pslld xmm7, 15 |
| 976 | 976 |
| 977 convertloop: | 977 convertloop: |
| (...skipping 16 matching lines...) Expand all Loading... |
| 994 lea eax, [eax + 16] | 994 lea eax, [eax + 16] |
| 995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 | 995 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 |
| 996 lea edx, [edx + 8] | 996 lea edx, [edx + 8] |
| 997 sub ecx, 4 | 997 sub ecx, 4 |
| 998 jg convertloop | 998 jg convertloop |
| 999 ret | 999 ret |
| 1000 } | 1000 } |
| 1001 } | 1001 } |
| 1002 | 1002 |
| 1003 __declspec(naked) | 1003 __declspec(naked) |
| 1004 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1004 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 1005 __asm { | 1005 __asm { |
| 1006 mov eax, [esp + 4] // src_argb | 1006 mov eax, [esp + 4] // src_argb |
| 1007 mov edx, [esp + 8] // dst_rgb | 1007 mov edx, [esp + 8] // dst_rgb |
| 1008 mov ecx, [esp + 12] // pix | 1008 mov ecx, [esp + 12] // width |
| 1009 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 | 1009 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| 1010 psllw xmm4, 12 | 1010 psllw xmm4, 12 |
| 1011 movdqa xmm3, xmm4 // generate mask 0x00f000f0 | 1011 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| 1012 psrlw xmm3, 8 | 1012 psrlw xmm3, 8 |
| 1013 | 1013 |
| 1014 convertloop: | 1014 convertloop: |
| 1015 movdqu xmm0, [eax] // fetch 4 pixels of argb | 1015 movdqu xmm0, [eax] // fetch 4 pixels of argb |
| 1016 movdqa xmm1, xmm0 | 1016 movdqa xmm1, xmm0 |
| 1017 pand xmm0, xmm3 // low nibble | 1017 pand xmm0, xmm3 // low nibble |
| 1018 pand xmm1, xmm4 // high nibble | 1018 pand xmm1, xmm4 // high nibble |
| 1019 psrld xmm0, 4 | 1019 psrld xmm0, 4 |
| 1020 psrld xmm1, 8 | 1020 psrld xmm1, 8 |
| 1021 por xmm0, xmm1 | 1021 por xmm0, xmm1 |
| 1022 packuswb xmm0, xmm0 | 1022 packuswb xmm0, xmm0 |
| 1023 lea eax, [eax + 16] | 1023 lea eax, [eax + 16] |
| 1024 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 | 1024 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| 1025 lea edx, [edx + 8] | 1025 lea edx, [edx + 8] |
| 1026 sub ecx, 4 | 1026 sub ecx, 4 |
| 1027 jg convertloop | 1027 jg convertloop |
| 1028 ret | 1028 ret |
| 1029 } | 1029 } |
| 1030 } | 1030 } |
| 1031 | 1031 |
| 1032 #ifdef HAS_ARGBTORGB565ROW_AVX2 | 1032 #ifdef HAS_ARGBTORGB565ROW_AVX2 |
| 1033 __declspec(naked) | 1033 __declspec(naked) |
| 1034 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1034 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 1035 __asm { | 1035 __asm { |
| 1036 mov eax, [esp + 4] // src_argb | 1036 mov eax, [esp + 4] // src_argb |
| 1037 mov edx, [esp + 8] // dst_rgb | 1037 mov edx, [esp + 8] // dst_rgb |
| 1038 mov ecx, [esp + 12] // pix | 1038 mov ecx, [esp + 12] // width |
| 1039 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f | 1039 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f |
| 1040 vpsrld ymm3, ymm3, 27 | 1040 vpsrld ymm3, ymm3, 27 |
| 1041 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 | 1041 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 |
| 1042 vpsrld ymm4, ymm4, 26 | 1042 vpsrld ymm4, ymm4, 26 |
| 1043 vpslld ymm4, ymm4, 5 | 1043 vpslld ymm4, ymm4, 5 |
| 1044 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 | 1044 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 |
| 1045 | 1045 |
| 1046 convertloop: | 1046 convertloop: |
| 1047 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1047 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 1048 vpsrld ymm2, ymm0, 5 // G | 1048 vpsrld ymm2, ymm0, 5 // G |
| (...skipping 12 matching lines...) Expand all Loading... |
| 1061 sub ecx, 8 | 1061 sub ecx, 8 |
| 1062 jg convertloop | 1062 jg convertloop |
| 1063 vzeroupper | 1063 vzeroupper |
| 1064 ret | 1064 ret |
| 1065 } | 1065 } |
| 1066 } | 1066 } |
| 1067 #endif // HAS_ARGBTORGB565ROW_AVX2 | 1067 #endif // HAS_ARGBTORGB565ROW_AVX2 |
| 1068 | 1068 |
| 1069 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 | 1069 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 |
| 1070 __declspec(naked) | 1070 __declspec(naked) |
| 1071 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1071 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 1072 __asm { | 1072 __asm { |
| 1073 mov eax, [esp + 4] // src_argb | 1073 mov eax, [esp + 4] // src_argb |
| 1074 mov edx, [esp + 8] // dst_rgb | 1074 mov edx, [esp + 8] // dst_rgb |
| 1075 mov ecx, [esp + 12] // pix | 1075 mov ecx, [esp + 12] // width |
| 1076 vpcmpeqb ymm4, ymm4, ymm4 | 1076 vpcmpeqb ymm4, ymm4, ymm4 |
| 1077 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f | 1077 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f |
| 1078 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 | 1078 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 |
| 1079 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 | 1079 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 |
| 1080 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 | 1080 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 |
| 1081 vpslld ymm7, ymm7, 15 | 1081 vpslld ymm7, ymm7, 15 |
| 1082 | 1082 |
| 1083 convertloop: | 1083 convertloop: |
| 1084 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1084 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 1085 vpsrld ymm3, ymm0, 9 // R | 1085 vpsrld ymm3, ymm0, 9 // R |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1101 sub ecx, 8 | 1101 sub ecx, 8 |
| 1102 jg convertloop | 1102 jg convertloop |
| 1103 vzeroupper | 1103 vzeroupper |
| 1104 ret | 1104 ret |
| 1105 } | 1105 } |
| 1106 } | 1106 } |
| 1107 #endif // HAS_ARGBTOARGB1555ROW_AVX2 | 1107 #endif // HAS_ARGBTOARGB1555ROW_AVX2 |
| 1108 | 1108 |
| 1109 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 | 1109 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 |
| 1110 __declspec(naked) | 1110 __declspec(naked) |
| 1111 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { | 1111 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { |
| 1112 __asm { | 1112 __asm { |
| 1113 mov eax, [esp + 4] // src_argb | 1113 mov eax, [esp + 4] // src_argb |
| 1114 mov edx, [esp + 8] // dst_rgb | 1114 mov edx, [esp + 8] // dst_rgb |
| 1115 mov ecx, [esp + 12] // pix | 1115 mov ecx, [esp + 12] // width |
| 1116 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 | 1116 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 |
| 1117 vpsllw ymm4, ymm4, 12 | 1117 vpsllw ymm4, ymm4, 12 |
| 1118 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 | 1118 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 |
| 1119 | 1119 |
| 1120 convertloop: | 1120 convertloop: |
| 1121 vmovdqu ymm0, [eax] // fetch 8 pixels of argb | 1121 vmovdqu ymm0, [eax] // fetch 8 pixels of argb |
| 1122 vpand ymm1, ymm0, ymm4 // high nibble | 1122 vpand ymm1, ymm0, ymm4 // high nibble |
| 1123 vpand ymm0, ymm0, ymm3 // low nibble | 1123 vpand ymm0, ymm0, ymm3 // low nibble |
| 1124 vpsrld ymm1, ymm1, 8 | 1124 vpsrld ymm1, ymm1, 8 |
| 1125 vpsrld ymm0, ymm0, 4 | 1125 vpsrld ymm0, ymm0, 4 |
| 1126 vpor ymm0, ymm0, ymm1 | 1126 vpor ymm0, ymm0, ymm1 |
| 1127 vpackuswb ymm0, ymm0, ymm0 | 1127 vpackuswb ymm0, ymm0, ymm0 |
| 1128 vpermq ymm0, ymm0, 0xd8 | 1128 vpermq ymm0, ymm0, 0xd8 |
| 1129 lea eax, [eax + 32] | 1129 lea eax, [eax + 32] |
| 1130 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 | 1130 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 |
| 1131 lea edx, [edx + 16] | 1131 lea edx, [edx + 16] |
| 1132 sub ecx, 8 | 1132 sub ecx, 8 |
| 1133 jg convertloop | 1133 jg convertloop |
| 1134 vzeroupper | 1134 vzeroupper |
| 1135 ret | 1135 ret |
| 1136 } | 1136 } |
| 1137 } | 1137 } |
| 1138 #endif // HAS_ARGBTOARGB4444ROW_AVX2 | 1138 #endif // HAS_ARGBTOARGB4444ROW_AVX2 |
| 1139 | 1139 |
| 1140 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | 1140 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. |
| 1141 __declspec(naked) | 1141 __declspec(naked) |
| 1142 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1142 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
| 1143 __asm { | 1143 __asm { |
| 1144 mov eax, [esp + 4] /* src_argb */ | 1144 mov eax, [esp + 4] /* src_argb */ |
| 1145 mov edx, [esp + 8] /* dst_y */ | 1145 mov edx, [esp + 8] /* dst_y */ |
| 1146 mov ecx, [esp + 12] /* pix */ | 1146 mov ecx, [esp + 12] /* width */ |
| 1147 movdqa xmm4, xmmword ptr kARGBToY | 1147 movdqa xmm4, xmmword ptr kARGBToY |
| 1148 movdqa xmm5, xmmword ptr kAddY16 | 1148 movdqa xmm5, xmmword ptr kAddY16 |
| 1149 | 1149 |
| 1150 convertloop: | 1150 convertloop: |
| 1151 movdqu xmm0, [eax] | 1151 movdqu xmm0, [eax] |
| 1152 movdqu xmm1, [eax + 16] | 1152 movdqu xmm1, [eax + 16] |
| 1153 movdqu xmm2, [eax + 32] | 1153 movdqu xmm2, [eax + 32] |
| 1154 movdqu xmm3, [eax + 48] | 1154 movdqu xmm3, [eax + 48] |
| 1155 pmaddubsw xmm0, xmm4 | 1155 pmaddubsw xmm0, xmm4 |
| 1156 pmaddubsw xmm1, xmm4 | 1156 pmaddubsw xmm1, xmm4 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1167 lea edx, [edx + 16] | 1167 lea edx, [edx + 16] |
| 1168 sub ecx, 16 | 1168 sub ecx, 16 |
| 1169 jg convertloop | 1169 jg convertloop |
| 1170 ret | 1170 ret |
| 1171 } | 1171 } |
| 1172 } | 1172 } |
| 1173 | 1173 |
| 1174 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | 1174 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. |
| 1175 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. | 1175 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. |
| 1176 __declspec(naked) | 1176 __declspec(naked) |
| 1177 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1177 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
| 1178 __asm { | 1178 __asm { |
| 1179 mov eax, [esp + 4] /* src_argb */ | 1179 mov eax, [esp + 4] /* src_argb */ |
| 1180 mov edx, [esp + 8] /* dst_y */ | 1180 mov edx, [esp + 8] /* dst_y */ |
| 1181 mov ecx, [esp + 12] /* pix */ | 1181 mov ecx, [esp + 12] /* width */ |
| 1182 movdqa xmm4, xmmword ptr kARGBToYJ | 1182 movdqa xmm4, xmmword ptr kARGBToYJ |
| 1183 movdqa xmm5, xmmword ptr kAddYJ64 | 1183 movdqa xmm5, xmmword ptr kAddYJ64 |
| 1184 | 1184 |
| 1185 convertloop: | 1185 convertloop: |
| 1186 movdqu xmm0, [eax] | 1186 movdqu xmm0, [eax] |
| 1187 movdqu xmm1, [eax + 16] | 1187 movdqu xmm1, [eax + 16] |
| 1188 movdqu xmm2, [eax + 32] | 1188 movdqu xmm2, [eax + 32] |
| 1189 movdqu xmm3, [eax + 48] | 1189 movdqu xmm3, [eax + 48] |
| 1190 pmaddubsw xmm0, xmm4 | 1190 pmaddubsw xmm0, xmm4 |
| 1191 pmaddubsw xmm1, xmm4 | 1191 pmaddubsw xmm1, xmm4 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 1208 } | 1208 } |
| 1209 | 1209 |
| 1210 #ifdef HAS_ARGBTOYROW_AVX2 | 1210 #ifdef HAS_ARGBTOYROW_AVX2 |
| 1211 // vpermd for vphaddw + vpackuswb vpermd. | 1211 // vpermd for vphaddw + vpackuswb vpermd. |
| 1212 static const lvec32 kPermdARGBToY_AVX = { | 1212 static const lvec32 kPermdARGBToY_AVX = { |
| 1213 0, 4, 1, 5, 2, 6, 3, 7 | 1213 0, 4, 1, 5, 2, 6, 3, 7 |
| 1214 }; | 1214 }; |
| 1215 | 1215 |
| 1216 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1216 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 1217 __declspec(naked) | 1217 __declspec(naked) |
| 1218 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1218 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
| 1219 __asm { | 1219 __asm { |
| 1220 mov eax, [esp + 4] /* src_argb */ | 1220 mov eax, [esp + 4] /* src_argb */ |
| 1221 mov edx, [esp + 8] /* dst_y */ | 1221 mov edx, [esp + 8] /* dst_y */ |
| 1222 mov ecx, [esp + 12] /* pix */ | 1222 mov ecx, [esp + 12] /* width */ |
| 1223 vbroadcastf128 ymm4, xmmword ptr kARGBToY | 1223 vbroadcastf128 ymm4, xmmword ptr kARGBToY |
| 1224 vbroadcastf128 ymm5, xmmword ptr kAddY16 | 1224 vbroadcastf128 ymm5, xmmword ptr kAddY16 |
| 1225 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX | 1225 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
| 1226 | 1226 |
| 1227 convertloop: | 1227 convertloop: |
| 1228 vmovdqu ymm0, [eax] | 1228 vmovdqu ymm0, [eax] |
| 1229 vmovdqu ymm1, [eax + 32] | 1229 vmovdqu ymm1, [eax + 32] |
| 1230 vmovdqu ymm2, [eax + 64] | 1230 vmovdqu ymm2, [eax + 64] |
| 1231 vmovdqu ymm3, [eax + 96] | 1231 vmovdqu ymm3, [eax + 96] |
| 1232 vpmaddubsw ymm0, ymm0, ymm4 | 1232 vpmaddubsw ymm0, ymm0, ymm4 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 1247 jg convertloop | 1247 jg convertloop |
| 1248 vzeroupper | 1248 vzeroupper |
| 1249 ret | 1249 ret |
| 1250 } | 1250 } |
| 1251 } | 1251 } |
| 1252 #endif // HAS_ARGBTOYROW_AVX2 | 1252 #endif // HAS_ARGBTOYROW_AVX2 |
| 1253 | 1253 |
| 1254 #ifdef HAS_ARGBTOYJROW_AVX2 | 1254 #ifdef HAS_ARGBTOYJROW_AVX2 |
| 1255 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 1255 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 1256 __declspec(naked) | 1256 __declspec(naked) |
| 1257 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | 1257 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
| 1258 __asm { | 1258 __asm { |
| 1259 mov eax, [esp + 4] /* src_argb */ | 1259 mov eax, [esp + 4] /* src_argb */ |
| 1260 mov edx, [esp + 8] /* dst_y */ | 1260 mov edx, [esp + 8] /* dst_y */ |
| 1261 mov ecx, [esp + 12] /* pix */ | 1261 mov ecx, [esp + 12] /* width */ |
| 1262 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ | 1262 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ |
| 1263 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 | 1263 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 |
| 1264 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX | 1264 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX |
| 1265 | 1265 |
| 1266 convertloop: | 1266 convertloop: |
| 1267 vmovdqu ymm0, [eax] | 1267 vmovdqu ymm0, [eax] |
| 1268 vmovdqu ymm1, [eax + 32] | 1268 vmovdqu ymm1, [eax + 32] |
| 1269 vmovdqu ymm2, [eax + 64] | 1269 vmovdqu ymm2, [eax + 64] |
| 1270 vmovdqu ymm3, [eax + 96] | 1270 vmovdqu ymm3, [eax + 96] |
| 1271 vpmaddubsw ymm0, ymm0, ymm4 | 1271 vpmaddubsw ymm0, ymm0, ymm4 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 1286 sub ecx, 32 | 1286 sub ecx, 32 |
| 1287 jg convertloop | 1287 jg convertloop |
| 1288 | 1288 |
| 1289 vzeroupper | 1289 vzeroupper |
| 1290 ret | 1290 ret |
| 1291 } | 1291 } |
| 1292 } | 1292 } |
| 1293 #endif // HAS_ARGBTOYJROW_AVX2 | 1293 #endif // HAS_ARGBTOYJROW_AVX2 |
| 1294 | 1294 |
| 1295 __declspec(naked) | 1295 __declspec(naked) |
| 1296 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1296 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
| 1297 __asm { | 1297 __asm { |
| 1298 mov eax, [esp + 4] /* src_argb */ | 1298 mov eax, [esp + 4] /* src_argb */ |
| 1299 mov edx, [esp + 8] /* dst_y */ | 1299 mov edx, [esp + 8] /* dst_y */ |
| 1300 mov ecx, [esp + 12] /* pix */ | 1300 mov ecx, [esp + 12] /* width */ |
| 1301 movdqa xmm4, xmmword ptr kBGRAToY | 1301 movdqa xmm4, xmmword ptr kBGRAToY |
| 1302 movdqa xmm5, xmmword ptr kAddY16 | 1302 movdqa xmm5, xmmword ptr kAddY16 |
| 1303 | 1303 |
| 1304 convertloop: | 1304 convertloop: |
| 1305 movdqu xmm0, [eax] | 1305 movdqu xmm0, [eax] |
| 1306 movdqu xmm1, [eax + 16] | 1306 movdqu xmm1, [eax + 16] |
| 1307 movdqu xmm2, [eax + 32] | 1307 movdqu xmm2, [eax + 32] |
| 1308 movdqu xmm3, [eax + 48] | 1308 movdqu xmm3, [eax + 48] |
| 1309 pmaddubsw xmm0, xmm4 | 1309 pmaddubsw xmm0, xmm4 |
| 1310 pmaddubsw xmm1, xmm4 | 1310 pmaddubsw xmm1, xmm4 |
| 1311 pmaddubsw xmm2, xmm4 | 1311 pmaddubsw xmm2, xmm4 |
| 1312 pmaddubsw xmm3, xmm4 | 1312 pmaddubsw xmm3, xmm4 |
| 1313 lea eax, [eax + 64] | 1313 lea eax, [eax + 64] |
| 1314 phaddw xmm0, xmm1 | 1314 phaddw xmm0, xmm1 |
| 1315 phaddw xmm2, xmm3 | 1315 phaddw xmm2, xmm3 |
| 1316 psrlw xmm0, 7 | 1316 psrlw xmm0, 7 |
| 1317 psrlw xmm2, 7 | 1317 psrlw xmm2, 7 |
| 1318 packuswb xmm0, xmm2 | 1318 packuswb xmm0, xmm2 |
| 1319 paddb xmm0, xmm5 | 1319 paddb xmm0, xmm5 |
| 1320 movdqu [edx], xmm0 | 1320 movdqu [edx], xmm0 |
| 1321 lea edx, [edx + 16] | 1321 lea edx, [edx + 16] |
| 1322 sub ecx, 16 | 1322 sub ecx, 16 |
| 1323 jg convertloop | 1323 jg convertloop |
| 1324 ret | 1324 ret |
| 1325 } | 1325 } |
| 1326 } | 1326 } |
| 1327 | 1327 |
| 1328 __declspec(naked) | 1328 __declspec(naked) |
| 1329 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1329 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
| 1330 __asm { | 1330 __asm { |
| 1331 mov eax, [esp + 4] /* src_argb */ | 1331 mov eax, [esp + 4] /* src_argb */ |
| 1332 mov edx, [esp + 8] /* dst_y */ | 1332 mov edx, [esp + 8] /* dst_y */ |
| 1333 mov ecx, [esp + 12] /* pix */ | 1333 mov ecx, [esp + 12] /* width */ |
| 1334 movdqa xmm4, xmmword ptr kABGRToY | 1334 movdqa xmm4, xmmword ptr kABGRToY |
| 1335 movdqa xmm5, xmmword ptr kAddY16 | 1335 movdqa xmm5, xmmword ptr kAddY16 |
| 1336 | 1336 |
| 1337 convertloop: | 1337 convertloop: |
| 1338 movdqu xmm0, [eax] | 1338 movdqu xmm0, [eax] |
| 1339 movdqu xmm1, [eax + 16] | 1339 movdqu xmm1, [eax + 16] |
| 1340 movdqu xmm2, [eax + 32] | 1340 movdqu xmm2, [eax + 32] |
| 1341 movdqu xmm3, [eax + 48] | 1341 movdqu xmm3, [eax + 48] |
| 1342 pmaddubsw xmm0, xmm4 | 1342 pmaddubsw xmm0, xmm4 |
| 1343 pmaddubsw xmm1, xmm4 | 1343 pmaddubsw xmm1, xmm4 |
| 1344 pmaddubsw xmm2, xmm4 | 1344 pmaddubsw xmm2, xmm4 |
| 1345 pmaddubsw xmm3, xmm4 | 1345 pmaddubsw xmm3, xmm4 |
| 1346 lea eax, [eax + 64] | 1346 lea eax, [eax + 64] |
| 1347 phaddw xmm0, xmm1 | 1347 phaddw xmm0, xmm1 |
| 1348 phaddw xmm2, xmm3 | 1348 phaddw xmm2, xmm3 |
| 1349 psrlw xmm0, 7 | 1349 psrlw xmm0, 7 |
| 1350 psrlw xmm2, 7 | 1350 psrlw xmm2, 7 |
| 1351 packuswb xmm0, xmm2 | 1351 packuswb xmm0, xmm2 |
| 1352 paddb xmm0, xmm5 | 1352 paddb xmm0, xmm5 |
| 1353 movdqu [edx], xmm0 | 1353 movdqu [edx], xmm0 |
| 1354 lea edx, [edx + 16] | 1354 lea edx, [edx + 16] |
| 1355 sub ecx, 16 | 1355 sub ecx, 16 |
| 1356 jg convertloop | 1356 jg convertloop |
| 1357 ret | 1357 ret |
| 1358 } | 1358 } |
| 1359 } | 1359 } |
| 1360 | 1360 |
| 1361 __declspec(naked) | 1361 __declspec(naked) |
| 1362 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | 1362 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { |
| 1363 __asm { | 1363 __asm { |
| 1364 mov eax, [esp + 4] /* src_argb */ | 1364 mov eax, [esp + 4] /* src_argb */ |
| 1365 mov edx, [esp + 8] /* dst_y */ | 1365 mov edx, [esp + 8] /* dst_y */ |
| 1366 mov ecx, [esp + 12] /* pix */ | 1366 mov ecx, [esp + 12] /* width */ |
| 1367 movdqa xmm4, xmmword ptr kRGBAToY | 1367 movdqa xmm4, xmmword ptr kRGBAToY |
| 1368 movdqa xmm5, xmmword ptr kAddY16 | 1368 movdqa xmm5, xmmword ptr kAddY16 |
| 1369 | 1369 |
| 1370 convertloop: | 1370 convertloop: |
| 1371 movdqu xmm0, [eax] | 1371 movdqu xmm0, [eax] |
| 1372 movdqu xmm1, [eax + 16] | 1372 movdqu xmm1, [eax + 16] |
| 1373 movdqu xmm2, [eax + 32] | 1373 movdqu xmm2, [eax + 32] |
| 1374 movdqu xmm3, [eax + 48] | 1374 movdqu xmm3, [eax + 48] |
| 1375 pmaddubsw xmm0, xmm4 | 1375 pmaddubsw xmm0, xmm4 |
| 1376 pmaddubsw xmm1, xmm4 | 1376 pmaddubsw xmm1, xmm4 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 1394 __declspec(naked) | 1394 __declspec(naked) |
| 1395 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1395 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1396 uint8* dst_u, uint8* dst_v, int width) { | 1396 uint8* dst_u, uint8* dst_v, int width) { |
| 1397 __asm { | 1397 __asm { |
| 1398 push esi | 1398 push esi |
| 1399 push edi | 1399 push edi |
| 1400 mov eax, [esp + 8 + 4] // src_argb | 1400 mov eax, [esp + 8 + 4] // src_argb |
| 1401 mov esi, [esp + 8 + 8] // src_stride_argb | 1401 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1402 mov edx, [esp + 8 + 12] // dst_u | 1402 mov edx, [esp + 8 + 12] // dst_u |
| 1403 mov edi, [esp + 8 + 16] // dst_v | 1403 mov edi, [esp + 8 + 16] // dst_v |
| 1404 mov ecx, [esp + 8 + 20] // pix | 1404 mov ecx, [esp + 8 + 20] // width |
| 1405 movdqa xmm5, xmmword ptr kAddUV128 | 1405 movdqa xmm5, xmmword ptr kAddUV128 |
| 1406 movdqa xmm6, xmmword ptr kARGBToV | 1406 movdqa xmm6, xmmword ptr kARGBToV |
| 1407 movdqa xmm7, xmmword ptr kARGBToU | 1407 movdqa xmm7, xmmword ptr kARGBToU |
| 1408 sub edi, edx // stride from u to v | 1408 sub edi, edx // stride from u to v |
| 1409 | 1409 |
| 1410 convertloop: | 1410 convertloop: |
| 1411 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1411 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1412 movdqu xmm0, [eax] | 1412 movdqu xmm0, [eax] |
| 1413 movdqu xmm4, [eax + esi] | 1413 movdqu xmm4, [eax + esi] |
| 1414 pavgb xmm0, xmm4 | 1414 pavgb xmm0, xmm4 |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1464 __declspec(naked) | 1464 __declspec(naked) |
| 1465 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1465 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1466 uint8* dst_u, uint8* dst_v, int width) { | 1466 uint8* dst_u, uint8* dst_v, int width) { |
| 1467 __asm { | 1467 __asm { |
| 1468 push esi | 1468 push esi |
| 1469 push edi | 1469 push edi |
| 1470 mov eax, [esp + 8 + 4] // src_argb | 1470 mov eax, [esp + 8 + 4] // src_argb |
| 1471 mov esi, [esp + 8 + 8] // src_stride_argb | 1471 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1472 mov edx, [esp + 8 + 12] // dst_u | 1472 mov edx, [esp + 8 + 12] // dst_u |
| 1473 mov edi, [esp + 8 + 16] // dst_v | 1473 mov edi, [esp + 8 + 16] // dst_v |
| 1474 mov ecx, [esp + 8 + 20] // pix | 1474 mov ecx, [esp + 8 + 20] // width |
| 1475 movdqa xmm5, xmmword ptr kAddUVJ128 | 1475 movdqa xmm5, xmmword ptr kAddUVJ128 |
| 1476 movdqa xmm6, xmmword ptr kARGBToVJ | 1476 movdqa xmm6, xmmword ptr kARGBToVJ |
| 1477 movdqa xmm7, xmmword ptr kARGBToUJ | 1477 movdqa xmm7, xmmword ptr kARGBToUJ |
| 1478 sub edi, edx // stride from u to v | 1478 sub edi, edx // stride from u to v |
| 1479 | 1479 |
| 1480 convertloop: | 1480 convertloop: |
| 1481 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1481 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1482 movdqu xmm0, [eax] | 1482 movdqu xmm0, [eax] |
| 1483 movdqu xmm4, [eax + esi] | 1483 movdqu xmm4, [eax + esi] |
| 1484 pavgb xmm0, xmm4 | 1484 pavgb xmm0, xmm4 |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1536 __declspec(naked) | 1536 __declspec(naked) |
| 1537 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 1537 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
| 1538 uint8* dst_u, uint8* dst_v, int width) { | 1538 uint8* dst_u, uint8* dst_v, int width) { |
| 1539 __asm { | 1539 __asm { |
| 1540 push esi | 1540 push esi |
| 1541 push edi | 1541 push edi |
| 1542 mov eax, [esp + 8 + 4] // src_argb | 1542 mov eax, [esp + 8 + 4] // src_argb |
| 1543 mov esi, [esp + 8 + 8] // src_stride_argb | 1543 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1544 mov edx, [esp + 8 + 12] // dst_u | 1544 mov edx, [esp + 8 + 12] // dst_u |
| 1545 mov edi, [esp + 8 + 16] // dst_v | 1545 mov edi, [esp + 8 + 16] // dst_v |
| 1546 mov ecx, [esp + 8 + 20] // pix | 1546 mov ecx, [esp + 8 + 20] // width |
| 1547 vbroadcastf128 ymm5, xmmword ptr kAddUV128 | 1547 vbroadcastf128 ymm5, xmmword ptr kAddUV128 |
| 1548 vbroadcastf128 ymm6, xmmword ptr kARGBToV | 1548 vbroadcastf128 ymm6, xmmword ptr kARGBToV |
| 1549 vbroadcastf128 ymm7, xmmword ptr kARGBToU | 1549 vbroadcastf128 ymm7, xmmword ptr kARGBToU |
| 1550 sub edi, edx // stride from u to v | 1550 sub edi, edx // stride from u to v |
| 1551 | 1551 |
| 1552 convertloop: | 1552 convertloop: |
| 1553 /* step 1 - subsample 32x2 argb pixels to 16x1 */ | 1553 /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
| 1554 vmovdqu ymm0, [eax] | 1554 vmovdqu ymm0, [eax] |
| 1555 vmovdqu ymm1, [eax + 32] | 1555 vmovdqu ymm1, [eax + 32] |
| 1556 vmovdqu ymm2, [eax + 64] | 1556 vmovdqu ymm2, [eax + 64] |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1599 #endif // HAS_ARGBTOUVROW_AVX2 | 1599 #endif // HAS_ARGBTOUVROW_AVX2 |
| 1600 | 1600 |
| 1601 __declspec(naked) | 1601 __declspec(naked) |
| 1602 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1602 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
| 1603 uint8* dst_u, uint8* dst_v, int width) { | 1603 uint8* dst_u, uint8* dst_v, int width) { |
| 1604 __asm { | 1604 __asm { |
| 1605 push edi | 1605 push edi |
| 1606 mov eax, [esp + 4 + 4] // src_argb | 1606 mov eax, [esp + 4 + 4] // src_argb |
| 1607 mov edx, [esp + 4 + 8] // dst_u | 1607 mov edx, [esp + 4 + 8] // dst_u |
| 1608 mov edi, [esp + 4 + 12] // dst_v | 1608 mov edi, [esp + 4 + 12] // dst_v |
| 1609 mov ecx, [esp + 4 + 16] // pix | 1609 mov ecx, [esp + 4 + 16] // width |
| 1610 movdqa xmm5, xmmword ptr kAddUV128 | 1610 movdqa xmm5, xmmword ptr kAddUV128 |
| 1611 movdqa xmm6, xmmword ptr kARGBToV | 1611 movdqa xmm6, xmmword ptr kARGBToV |
| 1612 movdqa xmm7, xmmword ptr kARGBToU | 1612 movdqa xmm7, xmmword ptr kARGBToU |
| 1613 sub edi, edx // stride from u to v | 1613 sub edi, edx // stride from u to v |
| 1614 | 1614 |
| 1615 convertloop: | 1615 convertloop: |
| 1616 /* convert to U and V */ | 1616 /* convert to U and V */ |
| 1617 movdqu xmm0, [eax] // U | 1617 movdqu xmm0, [eax] // U |
| 1618 movdqu xmm1, [eax + 16] | 1618 movdqu xmm1, [eax + 16] |
| 1619 movdqu xmm2, [eax + 32] | 1619 movdqu xmm2, [eax + 32] |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1656 } | 1656 } |
| 1657 | 1657 |
| 1658 __declspec(naked) | 1658 __declspec(naked) |
| 1659 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | 1659 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
| 1660 uint8* dst_u, uint8* dst_v, int width) { | 1660 uint8* dst_u, uint8* dst_v, int width) { |
| 1661 __asm { | 1661 __asm { |
| 1662 push edi | 1662 push edi |
| 1663 mov eax, [esp + 4 + 4] // src_argb | 1663 mov eax, [esp + 4 + 4] // src_argb |
| 1664 mov edx, [esp + 4 + 8] // dst_u | 1664 mov edx, [esp + 4 + 8] // dst_u |
| 1665 mov edi, [esp + 4 + 12] // dst_v | 1665 mov edi, [esp + 4 + 12] // dst_v |
| 1666 mov ecx, [esp + 4 + 16] // pix | 1666 mov ecx, [esp + 4 + 16] // width |
| 1667 movdqa xmm5, xmmword ptr kAddUV128 | 1667 movdqa xmm5, xmmword ptr kAddUV128 |
| 1668 movdqa xmm6, xmmword ptr kARGBToV | 1668 movdqa xmm6, xmmword ptr kARGBToV |
| 1669 movdqa xmm7, xmmword ptr kARGBToU | 1669 movdqa xmm7, xmmword ptr kARGBToU |
| 1670 sub edi, edx // stride from u to v | 1670 sub edi, edx // stride from u to v |
| 1671 | 1671 |
| 1672 convertloop: | 1672 convertloop: |
| 1673 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1673 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1674 movdqu xmm0, [eax] | 1674 movdqu xmm0, [eax] |
| 1675 movdqu xmm1, [eax + 16] | 1675 movdqu xmm1, [eax + 16] |
| 1676 movdqu xmm2, [eax + 32] | 1676 movdqu xmm2, [eax + 32] |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1716 __declspec(naked) | 1716 __declspec(naked) |
| 1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1718 uint8* dst_u, uint8* dst_v, int width) { | 1718 uint8* dst_u, uint8* dst_v, int width) { |
| 1719 __asm { | 1719 __asm { |
| 1720 push esi | 1720 push esi |
| 1721 push edi | 1721 push edi |
| 1722 mov eax, [esp + 8 + 4] // src_argb | 1722 mov eax, [esp + 8 + 4] // src_argb |
| 1723 mov esi, [esp + 8 + 8] // src_stride_argb | 1723 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1724 mov edx, [esp + 8 + 12] // dst_u | 1724 mov edx, [esp + 8 + 12] // dst_u |
| 1725 mov edi, [esp + 8 + 16] // dst_v | 1725 mov edi, [esp + 8 + 16] // dst_v |
| 1726 mov ecx, [esp + 8 + 20] // pix | 1726 mov ecx, [esp + 8 + 20] // width |
| 1727 movdqa xmm5, xmmword ptr kAddUV128 | 1727 movdqa xmm5, xmmword ptr kAddUV128 |
| 1728 movdqa xmm6, xmmword ptr kBGRAToV | 1728 movdqa xmm6, xmmword ptr kBGRAToV |
| 1729 movdqa xmm7, xmmword ptr kBGRAToU | 1729 movdqa xmm7, xmmword ptr kBGRAToU |
| 1730 sub edi, edx // stride from u to v | 1730 sub edi, edx // stride from u to v |
| 1731 | 1731 |
| 1732 convertloop: | 1732 convertloop: |
| 1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1734 movdqu xmm0, [eax] | 1734 movdqu xmm0, [eax] |
| 1735 movdqu xmm4, [eax + esi] | 1735 movdqu xmm4, [eax + esi] |
| 1736 pavgb xmm0, xmm4 | 1736 pavgb xmm0, xmm4 |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1786 __declspec(naked) | 1786 __declspec(naked) |
| 1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1788 uint8* dst_u, uint8* dst_v, int width) { | 1788 uint8* dst_u, uint8* dst_v, int width) { |
| 1789 __asm { | 1789 __asm { |
| 1790 push esi | 1790 push esi |
| 1791 push edi | 1791 push edi |
| 1792 mov eax, [esp + 8 + 4] // src_argb | 1792 mov eax, [esp + 8 + 4] // src_argb |
| 1793 mov esi, [esp + 8 + 8] // src_stride_argb | 1793 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1794 mov edx, [esp + 8 + 12] // dst_u | 1794 mov edx, [esp + 8 + 12] // dst_u |
| 1795 mov edi, [esp + 8 + 16] // dst_v | 1795 mov edi, [esp + 8 + 16] // dst_v |
| 1796 mov ecx, [esp + 8 + 20] // pix | 1796 mov ecx, [esp + 8 + 20] // width |
| 1797 movdqa xmm5, xmmword ptr kAddUV128 | 1797 movdqa xmm5, xmmword ptr kAddUV128 |
| 1798 movdqa xmm6, xmmword ptr kABGRToV | 1798 movdqa xmm6, xmmword ptr kABGRToV |
| 1799 movdqa xmm7, xmmword ptr kABGRToU | 1799 movdqa xmm7, xmmword ptr kABGRToU |
| 1800 sub edi, edx // stride from u to v | 1800 sub edi, edx // stride from u to v |
| 1801 | 1801 |
| 1802 convertloop: | 1802 convertloop: |
| 1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1804 movdqu xmm0, [eax] | 1804 movdqu xmm0, [eax] |
| 1805 movdqu xmm4, [eax + esi] | 1805 movdqu xmm4, [eax + esi] |
| 1806 pavgb xmm0, xmm4 | 1806 pavgb xmm0, xmm4 |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1856 __declspec(naked) | 1856 __declspec(naked) |
| 1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
| 1858 uint8* dst_u, uint8* dst_v, int width) { | 1858 uint8* dst_u, uint8* dst_v, int width) { |
| 1859 __asm { | 1859 __asm { |
| 1860 push esi | 1860 push esi |
| 1861 push edi | 1861 push edi |
| 1862 mov eax, [esp + 8 + 4] // src_argb | 1862 mov eax, [esp + 8 + 4] // src_argb |
| 1863 mov esi, [esp + 8 + 8] // src_stride_argb | 1863 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1864 mov edx, [esp + 8 + 12] // dst_u | 1864 mov edx, [esp + 8 + 12] // dst_u |
| 1865 mov edi, [esp + 8 + 16] // dst_v | 1865 mov edi, [esp + 8 + 16] // dst_v |
| 1866 mov ecx, [esp + 8 + 20] // pix | 1866 mov ecx, [esp + 8 + 20] // width |
| 1867 movdqa xmm5, xmmword ptr kAddUV128 | 1867 movdqa xmm5, xmmword ptr kAddUV128 |
| 1868 movdqa xmm6, xmmword ptr kRGBAToV | 1868 movdqa xmm6, xmmword ptr kRGBAToV |
| 1869 movdqa xmm7, xmmword ptr kRGBAToU | 1869 movdqa xmm7, xmmword ptr kRGBAToU |
| 1870 sub edi, edx // stride from u to v | 1870 sub edi, edx // stride from u to v |
| 1871 | 1871 |
| 1872 convertloop: | 1872 convertloop: |
| 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ | 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
| 1874 movdqu xmm0, [eax] | 1874 movdqu xmm0, [eax] |
| 1875 movdqu xmm4, [eax + esi] | 1875 movdqu xmm4, [eax + esi] |
| 1876 pavgb xmm0, xmm4 | 1876 pavgb xmm0, xmm4 |
| (...skipping 1754 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3631 sub ecx, 8 | 3631 sub ecx, 8 |
| 3632 jg convertloop | 3632 jg convertloop |
| 3633 vzeroupper | 3633 vzeroupper |
| 3634 ret | 3634 ret |
| 3635 } | 3635 } |
| 3636 } | 3636 } |
| 3637 #endif // HAS_ARGBMIRRORROW_AVX2 | 3637 #endif // HAS_ARGBMIRRORROW_AVX2 |
| 3638 | 3638 |
| 3639 #ifdef HAS_SPLITUVROW_SSE2 | 3639 #ifdef HAS_SPLITUVROW_SSE2 |
| 3640 __declspec(naked) | 3640 __declspec(naked) |
| 3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3641 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { |
| 3642 __asm { | 3642 __asm { |
| 3643 push edi | 3643 push edi |
| 3644 mov eax, [esp + 4 + 4] // src_uv | 3644 mov eax, [esp + 4 + 4] // src_uv |
| 3645 mov edx, [esp + 4 + 8] // dst_u | 3645 mov edx, [esp + 4 + 8] // dst_u |
| 3646 mov edi, [esp + 4 + 12] // dst_v | 3646 mov edi, [esp + 4 + 12] // dst_v |
| 3647 mov ecx, [esp + 4 + 16] // pix | 3647 mov ecx, [esp + 4 + 16] // width |
| 3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 3648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 3649 psrlw xmm5, 8 | 3649 psrlw xmm5, 8 |
| 3650 sub edi, edx | 3650 sub edi, edx |
| 3651 | 3651 |
| 3652 convertloop: | 3652 convertloop: |
| 3653 movdqu xmm0, [eax] | 3653 movdqu xmm0, [eax] |
| 3654 movdqu xmm1, [eax + 16] | 3654 movdqu xmm1, [eax + 16] |
| 3655 lea eax, [eax + 32] | 3655 lea eax, [eax + 32] |
| 3656 movdqa xmm2, xmm0 | 3656 movdqa xmm2, xmm0 |
| 3657 movdqa xmm3, xmm1 | 3657 movdqa xmm3, xmm1 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 3669 | 3669 |
| 3670 pop edi | 3670 pop edi |
| 3671 ret | 3671 ret |
| 3672 } | 3672 } |
| 3673 } | 3673 } |
| 3674 | 3674 |
| 3675 #endif // HAS_SPLITUVROW_SSE2 | 3675 #endif // HAS_SPLITUVROW_SSE2 |
| 3676 | 3676 |
| 3677 #ifdef HAS_SPLITUVROW_AVX2 | 3677 #ifdef HAS_SPLITUVROW_AVX2 |
| 3678 __declspec(naked) | 3678 __declspec(naked) |
| 3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | 3679 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { |
| 3680 __asm { | 3680 __asm { |
| 3681 push edi | 3681 push edi |
| 3682 mov eax, [esp + 4 + 4] // src_uv | 3682 mov eax, [esp + 4 + 4] // src_uv |
| 3683 mov edx, [esp + 4 + 8] // dst_u | 3683 mov edx, [esp + 4 + 8] // dst_u |
| 3684 mov edi, [esp + 4 + 12] // dst_v | 3684 mov edi, [esp + 4 + 12] // dst_v |
| 3685 mov ecx, [esp + 4 + 16] // pix | 3685 mov ecx, [esp + 4 + 16] // width |
| 3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3686 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 3687 vpsrlw ymm5, ymm5, 8 | 3687 vpsrlw ymm5, ymm5, 8 |
| 3688 sub edi, edx | 3688 sub edi, edx |
| 3689 | 3689 |
| 3690 convertloop: | 3690 convertloop: |
| 3691 vmovdqu ymm0, [eax] | 3691 vmovdqu ymm0, [eax] |
| 3692 vmovdqu ymm1, [eax + 32] | 3692 vmovdqu ymm1, [eax + 32] |
| 3693 lea eax, [eax + 64] | 3693 lea eax, [eax + 64] |
| 3694 vpsrlw ymm2, ymm0, 8 // odd bytes | 3694 vpsrlw ymm2, ymm0, 8 // odd bytes |
| 3695 vpsrlw ymm3, ymm1, 8 | 3695 vpsrlw ymm3, ymm1, 8 |
| (...skipping 321 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4017 mov ecx, [esp + 12] // count | 4017 mov ecx, [esp + 12] // count |
| 4018 rep stosd | 4018 rep stosd |
| 4019 mov edi, edx | 4019 mov edi, edx |
| 4020 ret | 4020 ret |
| 4021 } | 4021 } |
| 4022 } | 4022 } |
| 4023 #endif // HAS_SETROW_X86 | 4023 #endif // HAS_SETROW_X86 |
| 4024 | 4024 |
| 4025 #ifdef HAS_YUY2TOYROW_AVX2 | 4025 #ifdef HAS_YUY2TOYROW_AVX2 |
| 4026 __declspec(naked) | 4026 __declspec(naked) |
| 4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { | 4027 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { |
| 4028 __asm { | 4028 __asm { |
| 4029 mov eax, [esp + 4] // src_yuy2 | 4029 mov eax, [esp + 4] // src_yuy2 |
| 4030 mov edx, [esp + 8] // dst_y | 4030 mov edx, [esp + 8] // dst_y |
| 4031 mov ecx, [esp + 12] // pix | 4031 mov ecx, [esp + 12] // width |
| 4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) | 4032 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) |
| 4033 vpsrlw ymm5, ymm5, 8 | 4033 vpsrlw ymm5, ymm5, 8 |
| 4034 | 4034 |
| 4035 convertloop: | 4035 convertloop: |
| 4036 vmovdqu ymm0, [eax] | 4036 vmovdqu ymm0, [eax] |
| 4037 vmovdqu ymm1, [eax + 32] | 4037 vmovdqu ymm1, [eax + 32] |
| 4038 lea eax, [eax + 64] | 4038 lea eax, [eax + 64] |
| 4039 vpand ymm0, ymm0, ymm5 // even bytes are Y; the mask clears the odd bytes | 4039 vpand ymm0, ymm0, ymm5 // even bytes are Y; the mask clears the odd bytes |
| 4040 vpand ymm1, ymm1, ymm5 | 4040 vpand ymm1, ymm1, ymm5 |
| 4041 vpackuswb ymm0, ymm0, ymm1 // mutates: packs per 128-bit lane; vpermq 0xd8 below restores order | 4041 vpackuswb ymm0, ymm0, ymm1 // mutates: packs per 128-bit lane; vpermq 0xd8 below restores order |
| 4042 vpermq ymm0, ymm0, 0xd8 | 4042 vpermq ymm0, ymm0, 0xd8 |
| 4043 vmovdqu [edx], ymm0 | 4043 vmovdqu [edx], ymm0 |
| 4044 lea edx, [edx + 32] | 4044 lea edx, [edx + 32] |
| 4045 sub ecx, 32 | 4045 sub ecx, 32 |
| 4046 jg convertloop | 4046 jg convertloop |
| 4047 vzeroupper | 4047 vzeroupper |
| 4048 ret | 4048 ret |
| 4049 } | 4049 } |
| 4050 } | 4050 } |
| 4051 | 4051 |
| 4052 __declspec(naked) | 4052 __declspec(naked) |
| 4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 4053 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
| 4054 uint8* dst_u, uint8* dst_v, int pix) { | 4054 uint8* dst_u, uint8* dst_v, int width) { |
| 4055 __asm { | 4055 __asm { |
| 4056 push esi | 4056 push esi |
| 4057 push edi | 4057 push edi |
| 4058 mov eax, [esp + 8 + 4] // src_yuy2 | 4058 mov eax, [esp + 8 + 4] // src_yuy2 |
| 4059 mov esi, [esp + 8 + 8] // stride_yuy2 | 4059 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 4060 mov edx, [esp + 8 + 12] // dst_u | 4060 mov edx, [esp + 8 + 12] // dst_u |
| 4061 mov edi, [esp + 8 + 16] // dst_v | 4061 mov edi, [esp + 8 + 16] // dst_v |
| 4062 mov ecx, [esp + 8 + 20] // pix | 4062 mov ecx, [esp + 8 + 20] // width |
| 4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4063 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 4064 vpsrlw ymm5, ymm5, 8 | 4064 vpsrlw ymm5, ymm5, 8 |
| 4065 sub edi, edx | 4065 sub edi, edx |
| 4066 | 4066 |
| 4067 convertloop: | 4067 convertloop: |
| 4068 vmovdqu ymm0, [eax] | 4068 vmovdqu ymm0, [eax] |
| 4069 vmovdqu ymm1, [eax + 32] | 4069 vmovdqu ymm1, [eax + 32] |
| 4070 vpavgb ymm0, ymm0, [eax + esi] | 4070 vpavgb ymm0, ymm0, [eax + esi] |
| 4071 vpavgb ymm1, ymm1, [eax + esi + 32] | 4071 vpavgb ymm1, ymm1, [eax + esi + 32] |
| 4072 lea eax, [eax + 64] | 4072 lea eax, [eax + 64] |
| (...skipping 15 matching lines...) Expand all Loading... |
| 4088 | 4088 |
| 4089 pop edi | 4089 pop edi |
| 4090 pop esi | 4090 pop esi |
| 4091 vzeroupper | 4091 vzeroupper |
| 4092 ret | 4092 ret |
| 4093 } | 4093 } |
| 4094 } | 4094 } |
| 4095 | 4095 |
| 4096 __declspec(naked) | 4096 __declspec(naked) |
| 4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 4097 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
| 4098 uint8* dst_u, uint8* dst_v, int pix) { | 4098 uint8* dst_u, uint8* dst_v, int width) { |
| 4099 __asm { | 4099 __asm { |
| 4100 push edi | 4100 push edi |
| 4101 mov eax, [esp + 4 + 4] // src_yuy2 | 4101 mov eax, [esp + 4 + 4] // src_yuy2 |
| 4102 mov edx, [esp + 4 + 8] // dst_u | 4102 mov edx, [esp + 4 + 8] // dst_u |
| 4103 mov edi, [esp + 4 + 12] // dst_v | 4103 mov edi, [esp + 4 + 12] // dst_v |
| 4104 mov ecx, [esp + 4 + 16] // pix | 4104 mov ecx, [esp + 4 + 16] // width |
| 4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 4106 vpsrlw ymm5, ymm5, 8 | 4106 vpsrlw ymm5, ymm5, 8 |
| 4107 sub edi, edx | 4107 sub edi, edx |
| 4108 | 4108 |
| 4109 convertloop: | 4109 convertloop: |
| 4110 vmovdqu ymm0, [eax] | 4110 vmovdqu ymm0, [eax] |
| 4111 vmovdqu ymm1, [eax + 32] | 4111 vmovdqu ymm1, [eax + 32] |
| 4112 lea eax, [eax + 64] | 4112 lea eax, [eax + 64] |
| 4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV | 4113 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
| 4114 vpsrlw ymm1, ymm1, 8 | 4114 vpsrlw ymm1, ymm1, 8 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 4127 jg convertloop | 4127 jg convertloop |
| 4128 | 4128 |
| 4129 pop edi | 4129 pop edi |
| 4130 vzeroupper | 4130 vzeroupper |
| 4131 ret | 4131 ret |
| 4132 } | 4132 } |
| 4133 } | 4133 } |
| 4134 | 4134 |
| 4135 __declspec(naked) | 4135 __declspec(naked) |
| 4136 void UYVYToYRow_AVX2(const uint8* src_uyvy, | 4136 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
| 4137 uint8* dst_y, int pix) { | 4137 uint8* dst_y, int width) { |
| 4138 __asm { | 4138 __asm { |
| 4139 mov eax, [esp + 4] // src_uyvy | 4139 mov eax, [esp + 4] // src_uyvy |
| 4140 mov edx, [esp + 8] // dst_y | 4140 mov edx, [esp + 8] // dst_y |
| 4141 mov ecx, [esp + 12] // pix | 4141 mov ecx, [esp + 12] // width |
| 4142 | 4142 |
| 4143 convertloop: | 4143 convertloop: |
| 4144 vmovdqu ymm0, [eax] | 4144 vmovdqu ymm0, [eax] |
| 4145 vmovdqu ymm1, [eax + 32] | 4145 vmovdqu ymm1, [eax + 32] |
| 4146 lea eax, [eax + 64] | 4146 lea eax, [eax + 64] |
| 4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y; the shift moves them into the low byte of each word | 4147 vpsrlw ymm0, ymm0, 8 // odd bytes are Y; the shift moves them into the low byte of each word |
| 4148 vpsrlw ymm1, ymm1, 8 | 4148 vpsrlw ymm1, ymm1, 8 |
| 4149 vpackuswb ymm0, ymm0, ymm1 // mutates: packs per 128-bit lane; vpermq 0xd8 below restores order | 4149 vpackuswb ymm0, ymm0, ymm1 // mutates: packs per 128-bit lane; vpermq 0xd8 below restores order |
| 4150 vpermq ymm0, ymm0, 0xd8 | 4150 vpermq ymm0, ymm0, 0xd8 |
| 4151 vmovdqu [edx], ymm0 | 4151 vmovdqu [edx], ymm0 |
| 4152 lea edx, [edx + 32] | 4152 lea edx, [edx + 32] |
| 4153 sub ecx, 32 | 4153 sub ecx, 32 |
| 4154 jg convertloop | 4154 jg convertloop |
| 4155 vzeroupper | 4155 vzeroupper |
| 4156 ret | 4156 ret |
| 4157 } | 4157 } |
| 4158 } | 4158 } |
| 4159 | 4159 |
| 4160 __declspec(naked) | 4160 __declspec(naked) |
| 4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 4161 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
| 4162 uint8* dst_u, uint8* dst_v, int pix) { | 4162 uint8* dst_u, uint8* dst_v, int width) { |
| 4163 __asm { | 4163 __asm { |
| 4164 push esi | 4164 push esi |
| 4165 push edi | 4165 push edi |
| 4166 mov eax, [esp + 8 + 4] // src_uyvy | 4166 mov eax, [esp + 8 + 4] // src_uyvy |
| 4167 mov esi, [esp + 8 + 8] // stride_uyvy | 4167 mov esi, [esp + 8 + 8] // stride_uyvy |
| 4168 mov edx, [esp + 8 + 12] // dst_u | 4168 mov edx, [esp + 8 + 12] // dst_u |
| 4169 mov edi, [esp + 8 + 16] // dst_v | 4169 mov edi, [esp + 8 + 16] // dst_v |
| 4170 mov ecx, [esp + 8 + 20] // pix | 4170 mov ecx, [esp + 8 + 20] // width |
| 4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4171 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 4172 vpsrlw ymm5, ymm5, 8 | 4172 vpsrlw ymm5, ymm5, 8 |
| 4173 sub edi, edx | 4173 sub edi, edx |
| 4174 | 4174 |
| 4175 convertloop: | 4175 convertloop: |
| 4176 vmovdqu ymm0, [eax] | 4176 vmovdqu ymm0, [eax] |
| 4177 vmovdqu ymm1, [eax + 32] | 4177 vmovdqu ymm1, [eax + 32] |
| 4178 vpavgb ymm0, ymm0, [eax + esi] | 4178 vpavgb ymm0, ymm0, [eax + esi] |
| 4179 vpavgb ymm1, ymm1, [eax + esi + 32] | 4179 vpavgb ymm1, ymm1, [eax + esi + 32] |
| 4180 lea eax, [eax + 64] | 4180 lea eax, [eax + 64] |
| (...skipping 15 matching lines...) Expand all Loading... |
| 4196 | 4196 |
| 4197 pop edi | 4197 pop edi |
| 4198 pop esi | 4198 pop esi |
| 4199 vzeroupper | 4199 vzeroupper |
| 4200 ret | 4200 ret |
| 4201 } | 4201 } |
| 4202 } | 4202 } |
| 4203 | 4203 |
| 4204 __declspec(naked) | 4204 __declspec(naked) |
| 4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 4205 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
| 4206 uint8* dst_u, uint8* dst_v, int pix) { | 4206 uint8* dst_u, uint8* dst_v, int width) { |
| 4207 __asm { | 4207 __asm { |
| 4208 push edi | 4208 push edi |
| 4209 mov eax, [esp + 4 + 4] // src_uyvy | 4209 mov eax, [esp + 4 + 4] // src_uyvy |
| 4210 mov edx, [esp + 4 + 8] // dst_u | 4210 mov edx, [esp + 4 + 8] // dst_u |
| 4211 mov edi, [esp + 4 + 12] // dst_v | 4211 mov edi, [esp + 4 + 12] // dst_v |
| 4212 mov ecx, [esp + 4 + 16] // pix | 4212 mov ecx, [esp + 4 + 16] // width |
| 4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 4213 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
| 4214 vpsrlw ymm5, ymm5, 8 | 4214 vpsrlw ymm5, ymm5, 8 |
| 4215 sub edi, edx | 4215 sub edi, edx |
| 4216 | 4216 |
| 4217 convertloop: | 4217 convertloop: |
| 4218 vmovdqu ymm0, [eax] | 4218 vmovdqu ymm0, [eax] |
| 4219 vmovdqu ymm1, [eax + 32] | 4219 vmovdqu ymm1, [eax + 32] |
| 4220 lea eax, [eax + 64] | 4220 lea eax, [eax + 64] |
| 4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV | 4221 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
| 4222 vpand ymm1, ymm1, ymm5 | 4222 vpand ymm1, ymm1, ymm5 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 4237 pop edi | 4237 pop edi |
| 4238 vzeroupper | 4238 vzeroupper |
| 4239 ret | 4239 ret |
| 4240 } | 4240 } |
| 4241 } | 4241 } |
| 4242 #endif // HAS_YUY2TOYROW_AVX2 | 4242 #endif // HAS_YUY2TOYROW_AVX2 |
| 4243 | 4243 |
| 4244 #ifdef HAS_YUY2TOYROW_SSE2 | 4244 #ifdef HAS_YUY2TOYROW_SSE2 |
| 4245 __declspec(naked) | 4245 __declspec(naked) |
| 4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2, | 4246 void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
| 4247 uint8* dst_y, int pix) { | 4247 uint8* dst_y, int width) { |
| 4248 __asm { | 4248 __asm { |
| 4249 mov eax, [esp + 4] // src_yuy2 | 4249 mov eax, [esp + 4] // src_yuy2 |
| 4250 mov edx, [esp + 8] // dst_y | 4250 mov edx, [esp + 8] // dst_y |
| 4251 mov ecx, [esp + 12] // pix | 4251 mov ecx, [esp + 12] // width |
| 4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) | 4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) |
| 4253 psrlw xmm5, 8 | 4253 psrlw xmm5, 8 |
| 4254 | 4254 |
| 4255 convertloop: | 4255 convertloop: |
| 4256 movdqu xmm0, [eax] | 4256 movdqu xmm0, [eax] |
| 4257 movdqu xmm1, [eax + 16] | 4257 movdqu xmm1, [eax + 16] |
| 4258 lea eax, [eax + 32] | 4258 lea eax, [eax + 32] |
| 4259 pand xmm0, xmm5 // even bytes are Y; the mask clears the odd bytes | 4259 pand xmm0, xmm5 // even bytes are Y; the mask clears the odd bytes |
| 4260 pand xmm1, xmm5 | 4260 pand xmm1, xmm5 |
| 4261 packuswb xmm0, xmm1 | 4261 packuswb xmm0, xmm1 |
| 4262 movdqu [edx], xmm0 | 4262 movdqu [edx], xmm0 |
| 4263 lea edx, [edx + 16] | 4263 lea edx, [edx + 16] |
| 4264 sub ecx, 16 | 4264 sub ecx, 16 |
| 4265 jg convertloop | 4265 jg convertloop |
| 4266 ret | 4266 ret |
| 4267 } | 4267 } |
| 4268 } | 4268 } |
| 4269 | 4269 |
| 4270 __declspec(naked) | 4270 __declspec(naked) |
| 4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 4271 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
| 4272 uint8* dst_u, uint8* dst_v, int pix) { | 4272 uint8* dst_u, uint8* dst_v, int width) { |
| 4273 __asm { | 4273 __asm { |
| 4274 push esi | 4274 push esi |
| 4275 push edi | 4275 push edi |
| 4276 mov eax, [esp + 8 + 4] // src_yuy2 | 4276 mov eax, [esp + 8 + 4] // src_yuy2 |
| 4277 mov esi, [esp + 8 + 8] // stride_yuy2 | 4277 mov esi, [esp + 8 + 8] // stride_yuy2 |
| 4278 mov edx, [esp + 8 + 12] // dst_u | 4278 mov edx, [esp + 8 + 12] // dst_u |
| 4279 mov edi, [esp + 8 + 16] // dst_v | 4279 mov edi, [esp + 8 + 16] // dst_v |
| 4280 mov ecx, [esp + 8 + 20] // pix | 4280 mov ecx, [esp + 8 + 20] // width |
| 4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4281 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 4282 psrlw xmm5, 8 | 4282 psrlw xmm5, 8 |
| 4283 sub edi, edx | 4283 sub edi, edx |
| 4284 | 4284 |
| 4285 convertloop: | 4285 convertloop: |
| 4286 movdqu xmm0, [eax] | 4286 movdqu xmm0, [eax] |
| 4287 movdqu xmm1, [eax + 16] | 4287 movdqu xmm1, [eax + 16] |
| 4288 movdqu xmm2, [eax + esi] | 4288 movdqu xmm2, [eax + esi] |
| 4289 movdqu xmm3, [eax + esi + 16] | 4289 movdqu xmm3, [eax + esi + 16] |
| 4290 lea eax, [eax + 32] | 4290 lea eax, [eax + 32] |
| (...skipping 14 matching lines...) Expand all Loading... |
| 4305 jg convertloop | 4305 jg convertloop |
| 4306 | 4306 |
| 4307 pop edi | 4307 pop edi |
| 4308 pop esi | 4308 pop esi |
| 4309 ret | 4309 ret |
| 4310 } | 4310 } |
| 4311 } | 4311 } |
| 4312 | 4312 |
| 4313 __declspec(naked) | 4313 __declspec(naked) |
| 4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 4314 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
| 4315 uint8* dst_u, uint8* dst_v, int pix) { | 4315 uint8* dst_u, uint8* dst_v, int width) { |
| 4316 __asm { | 4316 __asm { |
| 4317 push edi | 4317 push edi |
| 4318 mov eax, [esp + 4 + 4] // src_yuy2 | 4318 mov eax, [esp + 4 + 4] // src_yuy2 |
| 4319 mov edx, [esp + 4 + 8] // dst_u | 4319 mov edx, [esp + 4 + 8] // dst_u |
| 4320 mov edi, [esp + 4 + 12] // dst_v | 4320 mov edi, [esp + 4 + 12] // dst_v |
| 4321 mov ecx, [esp + 4 + 16] // pix | 4321 mov ecx, [esp + 4 + 16] // width |
| 4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) | 4322 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff (all ones, then each word >> 8) |
| 4323 psrlw xmm5, 8 | 4323 psrlw xmm5, 8 |
| 4324 sub edi, edx | 4324 sub edi, edx |
| 4325 | 4325 |
| 4326 convertloop: | 4326 convertloop: |
| 4327 movdqu xmm0, [eax] | 4327 movdqu xmm0, [eax] |
| 4328 movdqu xmm1, [eax + 16] | 4328 movdqu xmm1, [eax + 16] |
| 4329 lea eax, [eax + 32] | 4329 lea eax, [eax + 32] |
| 4330 psrlw xmm0, 8 // YUYV -> UVUV: keep the odd (chroma) bytes | 4330 psrlw xmm0, 8 // YUYV -> UVUV: keep the odd (chroma) bytes |
| 4331 psrlw xmm1, 8 | 4331 psrlw xmm1, 8 |
| 4332 packuswb xmm0, xmm1 | 4332 packuswb xmm0, xmm1 |
| 4333 movdqa xmm1, xmm0 | 4333 movdqa xmm1, xmm0 |
| 4334 pand xmm0, xmm5 // U: even bytes of UVUV | 4334 pand xmm0, xmm5 // U: even bytes of UVUV |
| 4335 packuswb xmm0, xmm0 | 4335 packuswb xmm0, xmm0 |
| 4336 psrlw xmm1, 8 // V: odd bytes of UVUV | 4336 psrlw xmm1, 8 // V: odd bytes of UVUV |
| 4337 packuswb xmm1, xmm1 | 4337 packuswb xmm1, xmm1 |
| 4338 movq qword ptr [edx], xmm0 | 4338 movq qword ptr [edx], xmm0 |
| 4339 movq qword ptr [edx + edi], xmm1 | 4339 movq qword ptr [edx + edi], xmm1 |
| 4340 lea edx, [edx + 8] | 4340 lea edx, [edx + 8] |
| 4341 sub ecx, 16 | 4341 sub ecx, 16 |
| 4342 jg convertloop | 4342 jg convertloop |
| 4343 | 4343 |
| 4344 pop edi | 4344 pop edi |
| 4345 ret | 4345 ret |
| 4346 } | 4346 } |
| 4347 } | 4347 } |
| 4348 | 4348 |
| 4349 __declspec(naked) | 4349 __declspec(naked) |
| 4350 void UYVYToYRow_SSE2(const uint8* src_uyvy, | 4350 void UYVYToYRow_SSE2(const uint8* src_uyvy, |
| 4351 uint8* dst_y, int pix) { | 4351 uint8* dst_y, int width) { |
| 4352 __asm { | 4352 __asm { |
| 4353 mov eax, [esp + 4] // src_uyvy | 4353 mov eax, [esp + 4] // src_uyvy |
| 4354 mov edx, [esp + 8] // dst_y | 4354 mov edx, [esp + 8] // dst_y |
| 4355 mov ecx, [esp + 12] // pix | 4355 mov ecx, [esp + 12] // width |
| 4356 | 4356 |
| 4357 convertloop: | 4357 convertloop: |
| 4358 movdqu xmm0, [eax] | 4358 movdqu xmm0, [eax] |
| 4359 movdqu xmm1, [eax + 16] | 4359 movdqu xmm1, [eax + 16] |
| 4360 lea eax, [eax + 32] | 4360 lea eax, [eax + 32] |
| 4361 psrlw xmm0, 8 // odd bytes are Y; the shift moves them into the low byte of each word | 4361 psrlw xmm0, 8 // odd bytes are Y; the shift moves them into the low byte of each word |
| 4362 psrlw xmm1, 8 | 4362 psrlw xmm1, 8 |
| 4363 packuswb xmm0, xmm1 | 4363 packuswb xmm0, xmm1 |
| 4364 movdqu [edx], xmm0 | 4364 movdqu [edx], xmm0 |
| 4365 lea edx, [edx + 16] | 4365 lea edx, [edx + 16] |
| 4366 sub ecx, 16 | 4366 sub ecx, 16 |
| 4367 jg convertloop | 4367 jg convertloop |
| 4368 ret | 4368 ret |
| 4369 } | 4369 } |
| 4370 } | 4370 } |
| 4371 | 4371 |
| 4372 __declspec(naked) | 4372 __declspec(naked) |
| 4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 4373 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
| 4374 uint8* dst_u, uint8* dst_v, int pix) { | 4374 uint8* dst_u, uint8* dst_v, int width) { |
| 4375 __asm { | 4375 __asm { |
| 4376 push esi | 4376 push esi |
| 4377 push edi | 4377 push edi |
| 4378 mov eax, [esp + 8 + 4] // src_uyvy | 4378 mov eax, [esp + 8 + 4] // src_uyvy |
| 4379 mov esi, [esp + 8 + 8] // stride_uyvy | 4379 mov esi, [esp + 8 + 8] // stride_uyvy |
| 4380 mov edx, [esp + 8 + 12] // dst_u | 4380 mov edx, [esp + 8 + 12] // dst_u |
| 4381 mov edi, [esp + 8 + 16] // dst_v | 4381 mov edi, [esp + 8 + 16] // dst_v |
| 4382 mov ecx, [esp + 8 + 20] // pix | 4382 mov ecx, [esp + 8 + 20] // width |
| 4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4383 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 4384 psrlw xmm5, 8 | 4384 psrlw xmm5, 8 |
| 4385 sub edi, edx | 4385 sub edi, edx |
| 4386 | 4386 |
| 4387 convertloop: | 4387 convertloop: |
| 4388 movdqu xmm0, [eax] | 4388 movdqu xmm0, [eax] |
| 4389 movdqu xmm1, [eax + 16] | 4389 movdqu xmm1, [eax + 16] |
| 4390 movdqu xmm2, [eax + esi] | 4390 movdqu xmm2, [eax + esi] |
| 4391 movdqu xmm3, [eax + esi + 16] | 4391 movdqu xmm3, [eax + esi + 16] |
| 4392 lea eax, [eax + 32] | 4392 lea eax, [eax + 32] |
| (...skipping 14 matching lines...) Expand all Loading... |
| 4407 jg convertloop | 4407 jg convertloop |
| 4408 | 4408 |
| 4409 pop edi | 4409 pop edi |
| 4410 pop esi | 4410 pop esi |
| 4411 ret | 4411 ret |
| 4412 } | 4412 } |
| 4413 } | 4413 } |
| 4414 | 4414 |
| 4415 __declspec(naked) | 4415 __declspec(naked) |
| 4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 4416 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
| 4417 uint8* dst_u, uint8* dst_v, int pix) { | 4417 uint8* dst_u, uint8* dst_v, int width) { |
| 4418 __asm { | 4418 __asm { |
| 4419 push edi | 4419 push edi |
| 4420 mov eax, [esp + 4 + 4] // src_uyvy | 4420 mov eax, [esp + 4 + 4] // src_uyvy |
| 4421 mov edx, [esp + 4 + 8] // dst_u | 4421 mov edx, [esp + 4 + 8] // dst_u |
| 4422 mov edi, [esp + 4 + 12] // dst_v | 4422 mov edi, [esp + 4 + 12] // dst_v |
| 4423 mov ecx, [esp + 4 + 16] // pix | 4423 mov ecx, [esp + 4 + 16] // width |
| 4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 4424 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
| 4425 psrlw xmm5, 8 | 4425 psrlw xmm5, 8 |
| 4426 sub edi, edx | 4426 sub edi, edx |
| 4427 | 4427 |
| 4428 convertloop: | 4428 convertloop: |
| 4429 movdqu xmm0, [eax] | 4429 movdqu xmm0, [eax] |
| 4430 movdqu xmm1, [eax + 16] | 4430 movdqu xmm1, [eax + 16] |
| 4431 lea eax, [eax + 32] | 4431 lea eax, [eax + 32] |
| 4432 pand xmm0, xmm5 // UYVY -> UVUV | 4432 pand xmm0, xmm5 // UYVY -> UVUV |
| 4433 pand xmm1, xmm5 | 4433 pand xmm1, xmm5 |
| (...skipping 1685 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6119 pop edi | 6119 pop edi |
| 6120 pop esi | 6120 pop esi |
| 6121 ret | 6121 ret |
| 6122 } | 6122 } |
| 6123 } | 6123 } |
| 6124 #endif // HAS_INTERPOLATEROW_SSE2 | 6124 #endif // HAS_INTERPOLATEROW_SSE2 |
| 6125 | 6125 |
| 6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Applies the 16-byte shuffler with pshufb; each loop pass handles 32 bytes and subtracts 8 from width. | 6126 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Applies the 16-byte shuffler with pshufb; each loop pass handles 32 bytes and subtracts 8 from width. |
| 6127 __declspec(naked) | 6127 __declspec(naked) |
| 6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 6128 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 6129 const uint8* shuffler, int pix) { | 6129 const uint8* shuffler, int width) { |
| 6130 __asm { | 6130 __asm { |
| 6131 mov eax, [esp + 4] // src_argb | 6131 mov eax, [esp + 4] // src_argb |
| 6132 mov edx, [esp + 8] // dst_argb | 6132 mov edx, [esp + 8] // dst_argb |
| 6133 mov ecx, [esp + 12] // shuffler (ecx is reused for the count after the table is loaded) | 6133 mov ecx, [esp + 12] // shuffler (ecx is reused for the count after the table is loaded) |
| 6134 movdqu xmm5, [ecx] | 6134 movdqu xmm5, [ecx] |
| 6135 mov ecx, [esp + 16] // pix | 6135 mov ecx, [esp + 16] // width |
| 6136 | 6136 |
| 6137 wloop: | 6137 wloop: |
| 6138 movdqu xmm0, [eax] | 6138 movdqu xmm0, [eax] |
| 6139 movdqu xmm1, [eax + 16] | 6139 movdqu xmm1, [eax + 16] |
| 6140 lea eax, [eax + 32] | 6140 lea eax, [eax + 32] |
| 6141 pshufb xmm0, xmm5 | 6141 pshufb xmm0, xmm5 |
| 6142 pshufb xmm1, xmm5 | 6142 pshufb xmm1, xmm5 |
| 6143 movdqu [edx], xmm0 | 6143 movdqu [edx], xmm0 |
| 6144 movdqu [edx + 16], xmm1 | 6144 movdqu [edx + 16], xmm1 |
| 6145 lea edx, [edx + 32] | 6145 lea edx, [edx + 32] |
| 6146 sub ecx, 8 | 6146 sub ecx, 8 |
| 6147 jg wloop | 6147 jg wloop |
| 6148 ret | 6148 ret |
| 6149 } | 6149 } |
| 6150 } | 6150 } |
| 6151 | 6151 |
| 6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 6152 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
| 6153 __declspec(naked) | 6153 __declspec(naked) |
| 6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 6154 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
| 6155 const uint8* shuffler, int pix) { | 6155 const uint8* shuffler, int width) { |
| 6156 __asm { | 6156 __asm { |
| 6157 mov eax, [esp + 4] // src_argb | 6157 mov eax, [esp + 4] // src_argb |
| 6158 mov edx, [esp + 8] // dst_argb | 6158 mov edx, [esp + 8] // dst_argb |
| 6159 mov ecx, [esp + 12] // shuffler (ecx is reused for the count after the table is loaded) | 6159 mov ecx, [esp + 12] // shuffler (ecx is reused for the count after the table is loaded) |
| 6160 vbroadcastf128 ymm5, [ecx] // 16-byte shuffler broadcast to both lanes: same shuffle in high as low. | 6160 vbroadcastf128 ymm5, [ecx] // 16-byte shuffler broadcast to both lanes: same shuffle in high as low. |
| 6161 mov ecx, [esp + 16] // pix | 6161 mov ecx, [esp + 16] // width |
| 6162 | 6162 |
| 6163 wloop: | 6163 wloop: |
| 6164 vmovdqu ymm0, [eax] | 6164 vmovdqu ymm0, [eax] |
| 6165 vmovdqu ymm1, [eax + 32] | 6165 vmovdqu ymm1, [eax + 32] |
| 6166 lea eax, [eax + 64] | 6166 lea eax, [eax + 64] |
| 6167 vpshufb ymm0, ymm0, ymm5 | 6167 vpshufb ymm0, ymm0, ymm5 |
| 6168 vpshufb ymm1, ymm1, ymm5 | 6168 vpshufb ymm1, ymm1, ymm5 |
| 6169 vmovdqu [edx], ymm0 | 6169 vmovdqu [edx], ymm0 |
| 6170 vmovdqu [edx + 32], ymm1 | 6170 vmovdqu [edx + 32], ymm1 |
| 6171 lea edx, [edx + 64] | 6171 lea edx, [edx + 64] |
| 6172 sub ecx, 16 | 6172 sub ecx, 16 |
| 6173 jg wloop | 6173 jg wloop |
| 6174 | 6174 |
| 6175 vzeroupper | 6175 vzeroupper |
| 6176 ret | 6176 ret |
| 6177 } | 6177 } |
| 6178 } | 6178 } |
| 6179 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 6179 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| 6180 | 6180 |
| 6181 __declspec(naked) | 6181 __declspec(naked) |
| 6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 6182 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
| 6183 const uint8* shuffler, int pix) { | 6183 const uint8* shuffler, int width) { |
| 6184 __asm { | 6184 __asm { |
| 6185 push ebx | 6185 push ebx |
| 6186 push esi | 6186 push esi |
| 6187 mov eax, [esp + 8 + 4] // src_argb | 6187 mov eax, [esp + 8 + 4] // src_argb |
| 6188 mov edx, [esp + 8 + 8] // dst_argb | 6188 mov edx, [esp + 8 + 8] // dst_argb |
| 6189 mov esi, [esp + 8 + 12] // shuffler | 6189 mov esi, [esp + 8 + 12] // shuffler |
| 6190 mov ecx, [esp + 8 + 16] // pix | 6190 mov ecx, [esp + 8 + 16] // width |
| 6191 pxor xmm5, xmm5 | 6191 pxor xmm5, xmm5 |
| 6192 | 6192 |
| 6193 mov ebx, [esi] // shuffler | 6193 mov ebx, [esi] // shuffler |
| 6194 cmp ebx, 0x03000102 | 6194 cmp ebx, 0x03000102 |
| 6195 je shuf_3012 | 6195 je shuf_3012 |
| 6196 cmp ebx, 0x00010203 | 6196 cmp ebx, 0x00010203 |
| 6197 je shuf_0123 | 6197 je shuf_0123 |
| 6198 cmp ebx, 0x00030201 | 6198 cmp ebx, 0x00030201 |
| 6199 je shuf_0321 | 6199 je shuf_0321 |
| 6200 cmp ebx, 0x02010003 | 6200 cmp ebx, 0x02010003 |
| (...skipping 435 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6636 } | 6636 } |
| 6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6637 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6638 | 6638 |
| 6639 #endif // defined(_M_X64) | 6639 #endif // defined(_M_X64) |
| 6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6640 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6641 | 6641 |
| 6642 #ifdef __cplusplus | 6642 #ifdef __cplusplus |
| 6643 } // extern "C" | 6643 } // extern "C" |
| 6644 } // namespace libyuv | 6644 } // namespace libyuv |
| 6645 #endif | 6645 #endif |
| OLD | NEW |