OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 5553 matching lines...)
5564 mov edi, [esp + 8 + 4] // dst_ptr | 5564 mov edi, [esp + 8 + 4] // dst_ptr |
5565 mov esi, [esp + 8 + 8] // src_ptr | 5565 mov esi, [esp + 8 + 8] // src_ptr |
5566 mov edx, [esp + 8 + 12] // src_stride | 5566 mov edx, [esp + 8 + 12] // src_stride |
5567 mov ecx, [esp + 8 + 16] // dst_width | 5567 mov ecx, [esp + 8 + 16] // dst_width |
5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
5569 shr eax, 1 | 5569 shr eax, 1 |
5570 // Dispatch to specialized filters if applicable. | 5570 // Dispatch to specialized filters if applicable. |
5571 cmp eax, 0 | 5571 cmp eax, 0 |
5572 je xloop100 // 0 / 128. Blend 100 / 0. | 5572 je xloop100 // 0 / 128. Blend 100 / 0. |
5573 sub edi, esi | 5573 sub edi, esi |
5574 cmp eax, 32 | |
5575 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. | |
5576 cmp eax, 64 | 5574 cmp eax, 64 |
5577 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
5578 cmp eax, 96 | |
5579 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. | |
5580 | 5576 |
5581 vmovd xmm0, eax // high fraction 0..127 | 5577 vmovd xmm0, eax // high fraction 0..127 |
5582 neg eax | 5578 neg eax |
5583 add eax, 128 | 5579 add eax, 128 |
5584 vmovd xmm5, eax // low fraction 128..1 | 5580 vmovd xmm5, eax // low fraction 128..1 |
5585 vpunpcklbw xmm5, xmm5, xmm0 | 5581 vpunpcklbw xmm5, xmm5, xmm0 |
5586 vpunpcklwd xmm5, xmm5, xmm5 | 5582 vpunpcklwd xmm5, xmm5, xmm5 |
5587 vpxor ymm0, ymm0, ymm0 | 5583 vpxor ymm0, ymm0, ymm0 |
5588 vpermd ymm5, ymm0, ymm5 | 5584 vpermd ymm5, ymm0, ymm5 |
5589 | 5585 |
| 5586 mov eax, 0x00400040 // 64 for rounding. |
| 5587 vmovd xmm4, eax |
| 5588 vbroadcastss ymm4, xmm4 |
| 5589 |
5590 xloop: | 5590 xloop: |
5591 vmovdqu ymm0, [esi] | 5591 vmovdqu ymm0, [esi] |
5592 vmovdqu ymm2, [esi + edx] | 5592 vmovdqu ymm2, [esi + edx] |
5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates | 5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates |
5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates | 5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates |
5595 vpmaddubsw ymm0, ymm0, ymm5 | 5595 vpmaddubsw ymm0, ymm0, ymm5 |
5596 vpmaddubsw ymm1, ymm1, ymm5 | 5596 vpmaddubsw ymm1, ymm1, ymm5 |
| 5597 vpaddw ymm0, ymm0, ymm4 |
| 5598 vpaddw ymm1, ymm1, ymm4 |
5597 vpsrlw ymm0, ymm0, 7 | 5599 vpsrlw ymm0, ymm0, 7 |
5598 vpsrlw ymm1, ymm1, 7 | 5600 vpsrlw ymm1, ymm1, 7 |
5599 vpackuswb ymm0, ymm0, ymm1 // unmutates | 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates |
5600 vmovdqu [esi + edi], ymm0 | 5602 vmovdqu [esi + edi], ymm0 |
5601 lea esi, [esi + 32] | 5603 lea esi, [esi + 32] |
5602 sub ecx, 32 | 5604 sub ecx, 32 |
5603 jg xloop | 5605 jg xloop |
5604 jmp xloop99 | 5606 jmp xloop99 |
5605 | 5607 |
5606 // Blend 25 / 75. | |
5607 xloop25: | |
5608 vmovdqu ymm0, [esi] | |
5609 vmovdqu ymm1, [esi + edx] | |
5610 vpavgb ymm0, ymm0, ymm1 | |
5611 vpavgb ymm0, ymm0, ymm1 | |
5612 vmovdqu [esi + edi], ymm0 | |
5613 lea esi, [esi + 32] | |
5614 sub ecx, 32 | |
5615 jg xloop25 | |
5616 jmp xloop99 | |
5617 | |
5618 // Blend 50 / 50. | 5608 // Blend 50 / 50. |
5619 xloop50: | 5609 xloop50: |
5620 vmovdqu ymm0, [esi] | 5610 vmovdqu ymm0, [esi] |
5621 vpavgb ymm0, ymm0, [esi + edx] | 5611 vpavgb ymm0, ymm0, [esi + edx] |
5622 vmovdqu [esi + edi], ymm0 | 5612 vmovdqu [esi + edi], ymm0 |
5623 lea esi, [esi + 32] | 5613 lea esi, [esi + 32] |
5624 sub ecx, 32 | 5614 sub ecx, 32 |
5625 jg xloop50 | 5615 jg xloop50 |
5626 jmp xloop99 | 5616 jmp xloop99 |
5627 | 5617 |
5628 // Blend 75 / 25. | |
5629 xloop75: | |
5630 vmovdqu ymm1, [esi] | |
5631 vmovdqu ymm0, [esi + edx] | |
5632 vpavgb ymm0, ymm0, ymm1 | |
5633 vpavgb ymm0, ymm0, ymm1 | |
5634 vmovdqu [esi + edi], ymm0 | |
5635 lea esi, [esi + 32] | |
5636 sub ecx, 32 | |
5637 jg xloop75 | |
5638 jmp xloop99 | |
5639 | |
5640 // Blend 100 / 0 - Copy row unchanged. | 5618 // Blend 100 / 0 - Copy row unchanged. |
5641 xloop100: | 5619 xloop100: |
5642 rep movsb | 5620 rep movsb |
5643 | 5621 |
5644 xloop99: | 5622 xloop99: |
5645 pop edi | 5623 pop edi |
5646 pop esi | 5624 pop esi |
5647 vzeroupper | 5625 vzeroupper |
5648 ret | 5626 ret |
5649 } | 5627 } |
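The AVX2 (and, below, SSSE3) loops now add a bias of 64 before the final 7-bit shift, so each output byte is the rounded blend of the two source rows. A minimal scalar sketch of that formula in plain C; the helper name and signature are illustrative, not libyuv API:

    #include <stdint.h>

    // f = source_y_fraction >> 1, i.e. 0..127; src1 is the row at src0 + stride.
    static void InterpolateRowRef(uint8_t* dst, const uint8_t* src0,
                                  const uint8_t* src1, int width, int f) {
      for (int x = 0; x < width; ++x) {
        // pmaddubsw pairs (128 - f, f) with (src0, src1); +64 then >> 7 rounds.
        dst[x] = (uint8_t)((src0[x] * (128 - f) + src1[x] * f + 64) >> 7);
      }
    }

The f == 0 case is dispatched to a straight row copy before this formula is reached.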
(...skipping 11 matching lines...)
5661 mov edi, [esp + 8 + 4] // dst_ptr | 5639 mov edi, [esp + 8 + 4] // dst_ptr |
5662 mov esi, [esp + 8 + 8] // src_ptr | 5640 mov esi, [esp + 8 + 8] // src_ptr |
5663 mov edx, [esp + 8 + 12] // src_stride | 5641 mov edx, [esp + 8 + 12] // src_stride |
5664 mov ecx, [esp + 8 + 16] // dst_width | 5642 mov ecx, [esp + 8 + 16] // dst_width |
5665 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
5666 sub edi, esi | 5644 sub edi, esi |
5667 shr eax, 1 | 5645 shr eax, 1 |
5668 // Dispatch to specialized filters if applicable. | 5646 // Dispatch to specialized filters if applicable. |
5669 cmp eax, 0 | 5647 cmp eax, 0 |
5670 je xloop100 // 0 / 128. Blend 100 / 0. | 5648 je xloop100 // 0 / 128. Blend 100 / 0. |
5671 cmp eax, 32 | |
5672 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. | |
5673 cmp eax, 64 | 5649 cmp eax, 64 |
5674 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
5675 cmp eax, 96 | |
5676 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. | |
5677 | 5651 |
5678 movd xmm0, eax // high fraction 0..127 | 5652 movd xmm0, eax // high fraction 0..127 |
5679 neg eax | 5653 neg eax |
5680 add eax, 128 | 5654 add eax, 128 |
5681 movd xmm5, eax // low fraction 128..1 | 5655 movd xmm5, eax // low fraction 128..1 |
5682 punpcklbw xmm5, xmm0 | 5656 punpcklbw xmm5, xmm0 |
5683 punpcklwd xmm5, xmm5 | 5657 punpcklwd xmm5, xmm5 |
5684 pshufd xmm5, xmm5, 0 | 5658 pshufd xmm5, xmm5, 0 |
5685 | 5659 |
| 5660 mov eax, 0x00400040 // 64 for rounding. |
| 5661 movd xmm4, eax |
| 5662 pshufd xmm4, xmm4, 0x00 |
| 5663 |
5686 xloop: | 5664 xloop: |
5687 movdqu xmm0, [esi] | 5665 movdqu xmm0, [esi] |
5688 movdqu xmm2, [esi + edx] | 5666 movdqu xmm2, [esi + edx] |
5689 movdqu xmm1, xmm0 | 5667 movdqu xmm1, xmm0 |
5690 punpcklbw xmm0, xmm2 | 5668 punpcklbw xmm0, xmm2 |
5691 punpckhbw xmm1, xmm2 | 5669 punpckhbw xmm1, xmm2 |
5692 pmaddubsw xmm0, xmm5 | 5670 pmaddubsw xmm0, xmm5 |
5693 pmaddubsw xmm1, xmm5 | 5671 pmaddubsw xmm1, xmm5 |
| 5672 paddw xmm0, xmm4 |
| 5673 paddw xmm1, xmm4 |
5694 psrlw xmm0, 7 | 5674 psrlw xmm0, 7 |
5695 psrlw xmm1, 7 | 5675 psrlw xmm1, 7 |
5696 packuswb xmm0, xmm1 | 5676 packuswb xmm0, xmm1 |
5697 movdqu [esi + edi], xmm0 | 5677 movdqu [esi + edi], xmm0 |
5698 lea esi, [esi + 16] | 5678 lea esi, [esi + 16] |
5699 sub ecx, 16 | 5679 sub ecx, 16 |
5700 jg xloop | 5680 jg xloop |
5701 jmp xloop99 | 5681 jmp xloop99 |
5702 | 5682 |
5703 // Blend 25 / 75. | |
5704 xloop25: | |
5705 movdqu xmm0, [esi] | |
5706 movdqu xmm1, [esi + edx] | |
5707 pavgb xmm0, xmm1 | |
5708 pavgb xmm0, xmm1 | |
5709 movdqu [esi + edi], xmm0 | |
5710 lea esi, [esi + 16] | |
5711 sub ecx, 16 | |
5712 jg xloop25 | |
5713 jmp xloop99 | |
5714 | |
5715 // Blend 50 / 50. | 5683 // Blend 50 / 50. |
5716 xloop50: | 5684 xloop50: |
5717 movdqu xmm0, [esi] | 5685 movdqu xmm0, [esi] |
5718 movdqu xmm1, [esi + edx] | 5686 movdqu xmm1, [esi + edx] |
5719 pavgb xmm0, xmm1 | 5687 pavgb xmm0, xmm1 |
5720 movdqu [esi + edi], xmm0 | 5688 movdqu [esi + edi], xmm0 |
5721 lea esi, [esi + 16] | 5689 lea esi, [esi + 16] |
5722 sub ecx, 16 | 5690 sub ecx, 16 |
5723 jg xloop50 | 5691 jg xloop50 |
5724 jmp xloop99 | 5692 jmp xloop99 |
5725 | 5693 |
5726 // Blend 75 / 25. | |
5727 xloop75: | |
5728 movdqu xmm1, [esi] | |
5729 movdqu xmm0, [esi + edx] | |
5730 pavgb xmm0, xmm1 | |
5731 pavgb xmm0, xmm1 | |
5732 movdqu [esi + edi], xmm0 | |
5733 lea esi, [esi + 16] | |
5734 sub ecx, 16 | |
5735 jg xloop75 | |
5736 jmp xloop99 | |
5737 | |
5738 // Blend 100 / 0 - Copy row unchanged. | 5694 // Blend 100 / 0 - Copy row unchanged. |
5739 xloop100: | 5695 xloop100: |
5740 movdqu xmm0, [esi] | 5696 movdqu xmm0, [esi] |
5741 movdqu [esi + edi], xmm0 | |
5742 lea esi, [esi + 16] | |
5743 sub ecx, 16 | |
5744 jg xloop100 | |
5745 | |
5746 xloop99: | |
5747 pop edi | |
5748 pop esi | |
5749 ret | |
5750 } | |
5751 } | |
5752 | |
5753 #ifdef HAS_INTERPOLATEROW_SSE2 | |
5754 // Bilinear filter 16x2 -> 16x1 | |
5755 __declspec(naked) | |
5756 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | |
5757 ptrdiff_t src_stride, int dst_width, | |
5758 int source_y_fraction) { | |
5759 __asm { | |
5760 push esi | |
5761 push edi | |
5762 mov edi, [esp + 8 + 4] // dst_ptr | |
5763 mov esi, [esp + 8 + 8] // src_ptr | |
5764 mov edx, [esp + 8 + 12] // src_stride | |
5765 mov ecx, [esp + 8 + 16] // dst_width | |
5766 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | |
5767 sub edi, esi | |
5768 // Dispatch to specialized filters if applicable. | |
5769 cmp eax, 0 | |
5770 je xloop100 // 0 / 256. Blend 100 / 0. | |
5771 cmp eax, 64 | |
5772 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. | |
5773 cmp eax, 128 | |
5774 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. | |
5775 cmp eax, 192 | |
5776 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. | |
5777 | |
5778 movd xmm5, eax // xmm5 = y fraction | |
5779 punpcklbw xmm5, xmm5 | |
5780 psrlw xmm5, 1 | |
5781 punpcklwd xmm5, xmm5 | |
5782 punpckldq xmm5, xmm5 | |
5783 punpcklqdq xmm5, xmm5 | |
5784 pxor xmm4, xmm4 | |
5785 | |
5786 xloop: | |
5787 movdqu xmm0, [esi] // row0 | |
5788 movdqu xmm2, [esi + edx] // row1 | |
5789 movdqu xmm1, xmm0 | |
5790 movdqu xmm3, xmm2 | |
5791 punpcklbw xmm2, xmm4 | |
5792 punpckhbw xmm3, xmm4 | |
5793 punpcklbw xmm0, xmm4 | |
5794 punpckhbw xmm1, xmm4 | |
5795 psubw xmm2, xmm0 // row1 - row0 | |
5796 psubw xmm3, xmm1 | |
5797 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 | |
5798 paddw xmm3, xmm3 | |
5799 pmulhw xmm2, xmm5 // scale diff | |
5800 pmulhw xmm3, xmm5 | |
5801 paddw xmm0, xmm2 // sum rows | |
5802 paddw xmm1, xmm3 | |
5803 packuswb xmm0, xmm1 | |
5804 movdqu [esi + edi], xmm0 | |
5805 lea esi, [esi + 16] | |
5806 sub ecx, 16 | |
5807 jg xloop | |
5808 jmp xloop99 | |
5809 | |
5810 // Blend 25 / 75. | |
5811 xloop25: | |
5812 movdqu xmm0, [esi] | |
5813 movdqu xmm1, [esi + edx] | |
5814 pavgb xmm0, xmm1 | |
5815 pavgb xmm0, xmm1 | |
5816 movdqu [esi + edi], xmm0 | |
5817 lea esi, [esi + 16] | |
5818 sub ecx, 16 | |
5819 jg xloop25 | |
5820 jmp xloop99 | |
5821 | |
5822 // Blend 50 / 50. | |
5823 xloop50: | |
5824 movdqu xmm0, [esi] | |
5825 movdqu xmm1, [esi + edx] | |
5826 pavgb xmm0, xmm1 | |
5827 movdqu [esi + edi], xmm0 | |
5828 lea esi, [esi + 16] | |
5829 sub ecx, 16 | |
5830 jg xloop50 | |
5831 jmp xloop99 | |
5832 | |
5833 // Blend 75 / 25. | |
5834 xloop75: | |
5835 movdqu xmm1, [esi] | |
5836 movdqu xmm0, [esi + edx] | |
5837 pavgb xmm0, xmm1 | |
5838 pavgb xmm0, xmm1 | |
5839 movdqu [esi + edi], xmm0 | |
5840 lea esi, [esi + 16] | |
5841 sub ecx, 16 | |
5842 jg xloop75 | |
5843 jmp xloop99 | |
5844 | |
5845 // Blend 100 / 0 - Copy row unchanged. | |
5846 xloop100: | |
5847 movdqu xmm0, [esi] | |
5848 movdqu [esi + edi], xmm0 | 5697 movdqu [esi + edi], xmm0 |
5849 lea esi, [esi + 16] | 5698 lea esi, [esi + 16] |
5850 sub ecx, 16 | 5699 sub ecx, 16 |
5851 jg xloop100 | 5700 jg xloop100 |
5852 | 5701 |
5853 xloop99: | 5702 xloop99: |
5854 pop edi | 5703 pop edi |
5855 pop esi | 5704 pop esi |
5856 ret | 5705 ret |
5857 } | 5706 } |
5858 } | 5707 } |
5859 #endif // HAS_INTERPOLATEROW_SSE2 | |
5860 | 5708 |
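For the 50/50 case that keeps its dedicated loop, the rounded blend at f == 64 reduces exactly to (a + b + 1) >> 1, which is precisely what pavgb/vpavgb computes, so the fast path produces the same bytes as the general path. The removed 25/75 and 75/25 loops chained two pavgb operations, which round twice and can differ from a single rounded blend by one. A small standalone check of the 50/50 identity, in illustrative C rather than libyuv code:

    #include <assert.h>
    #include <stdint.h>

    // pavgb semantics: average of two unsigned bytes, rounded up.
    static uint8_t pavgb(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);
    }

    int main(void) {
      for (int a = 0; a < 256; ++a) {
        for (int b = 0; b < 256; ++b) {
          // (a*64 + b*64 + 64) >> 7 == (a + b + 1) >> 1 for all byte values.
          assert(((a * 64 + b * 64 + 64) >> 7) == pavgb((uint8_t)a, (uint8_t)b));
        }
      }
      return 0;
    }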
5861 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5709 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5862 __declspec(naked) | 5710 __declspec(naked) |
5863 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5711 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
5864 const uint8* shuffler, int width) { | 5712 const uint8* shuffler, int width) { |
5865 __asm { | 5713 __asm { |
5866 mov eax, [esp + 4] // src_argb | 5714 mov eax, [esp + 4] // src_argb |
5867 mov edx, [esp + 8] // dst_argb | 5715 mov edx, [esp + 8] // dst_argb |
5868 mov ecx, [esp + 12] // shuffler | 5716 mov ecx, [esp + 12] // shuffler |
5869 movdqu xmm5, [ecx] | 5717 movdqu xmm5, [ecx] |
(...skipping 501 matching lines...)
6371 } | 6219 } |
6372 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6373 | 6221 |
6374 #endif // defined(_M_X64) | 6222 #endif // defined(_M_X64) |
6375 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6376 | 6224 |
6377 #ifdef __cplusplus | 6225 #ifdef __cplusplus |
6378 } // extern "C" | 6226 } // extern "C" |
6379 } // namespace libyuv | 6227 } // namespace libyuv |
6380 #endif | 6228 #endif |