Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(46)

Side by Side Diff: source/row_win.cc

Issue 1535493002: roll to same version of chromium as head webrtc (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: use var for interpolant on neon armv7 Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_neon64.cc ('k') | source/scale.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 5553 matching lines...) Expand 10 before | Expand all | Expand 10 after
5564 mov edi, [esp + 8 + 4] // dst_ptr 5564 mov edi, [esp + 8 + 4] // dst_ptr
5565 mov esi, [esp + 8 + 8] // src_ptr 5565 mov esi, [esp + 8 + 8] // src_ptr
5566 mov edx, [esp + 8 + 12] // src_stride 5566 mov edx, [esp + 8 + 12] // src_stride
5567 mov ecx, [esp + 8 + 16] // dst_width 5567 mov ecx, [esp + 8 + 16] // dst_width
5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5569 shr eax, 1 5569 shr eax, 1
5570 // Dispatch to specialized filters if applicable. 5570 // Dispatch to specialized filters if applicable.
5571 cmp eax, 0 5571 cmp eax, 0
5572 je xloop100 // 0 / 128. Blend 100 / 0. 5572 je xloop100 // 0 / 128. Blend 100 / 0.
5573 sub edi, esi 5573 sub edi, esi
5574 cmp eax, 32
5575 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5576 cmp eax, 64 5574 cmp eax, 64
5577 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5578 cmp eax, 96
5579 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5580 5576
5581 vmovd xmm0, eax // high fraction 0..127 5577 vmovd xmm0, eax // high fraction 0..127
5582 neg eax 5578 neg eax
5583 add eax, 128 5579 add eax, 128
5584 vmovd xmm5, eax // low fraction 128..1 5580 vmovd xmm5, eax // low fraction 128..1
5585 vpunpcklbw xmm5, xmm5, xmm0 5581 vpunpcklbw xmm5, xmm5, xmm0
5586 vpunpcklwd xmm5, xmm5, xmm5 5582 vpunpcklwd xmm5, xmm5, xmm5
5587 vpxor ymm0, ymm0, ymm0 5583 vpxor ymm0, ymm0, ymm0
5588 vpermd ymm5, ymm0, ymm5 5584 vpermd ymm5, ymm0, ymm5
5589 5585
5586 mov eax, 0x00400040 // 64 for rounding.
5587 vmovd xmm4, eax
5588 vbroadcastss ymm4, xmm4
5589
5590 xloop: 5590 xloop:
5591 vmovdqu ymm0, [esi] 5591 vmovdqu ymm0, [esi]
5592 vmovdqu ymm2, [esi + edx] 5592 vmovdqu ymm2, [esi + edx]
5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates 5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates 5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates
5595 vpmaddubsw ymm0, ymm0, ymm5 5595 vpmaddubsw ymm0, ymm0, ymm5
5596 vpmaddubsw ymm1, ymm1, ymm5 5596 vpmaddubsw ymm1, ymm1, ymm5
5597 vpaddw ymm0, ymm0, ymm4
5598 vpaddw ymm1, ymm1, ymm4
5597 vpsrlw ymm0, ymm0, 7 5599 vpsrlw ymm0, ymm0, 7
5598 vpsrlw ymm1, ymm1, 7 5600 vpsrlw ymm1, ymm1, 7
5599 vpackuswb ymm0, ymm0, ymm1 // unmutates 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates
5600 vmovdqu [esi + edi], ymm0 5602 vmovdqu [esi + edi], ymm0
5601 lea esi, [esi + 32] 5603 lea esi, [esi + 32]
5602 sub ecx, 32 5604 sub ecx, 32
5603 jg xloop 5605 jg xloop
5604 jmp xloop99 5606 jmp xloop99
5605 5607
5606 // Blend 25 / 75.
5607 xloop25:
5608 vmovdqu ymm0, [esi]
5609 vmovdqu ymm1, [esi + edx]
5610 vpavgb ymm0, ymm0, ymm1
5611 vpavgb ymm0, ymm0, ymm1
5612 vmovdqu [esi + edi], ymm0
5613 lea esi, [esi + 32]
5614 sub ecx, 32
5615 jg xloop25
5616 jmp xloop99
5617
5618 // Blend 50 / 50. 5608 // Blend 50 / 50.
5619 xloop50: 5609 xloop50:
5620 vmovdqu ymm0, [esi] 5610 vmovdqu ymm0, [esi]
5621 vpavgb ymm0, ymm0, [esi + edx] 5611 vpavgb ymm0, ymm0, [esi + edx]
5622 vmovdqu [esi + edi], ymm0 5612 vmovdqu [esi + edi], ymm0
5623 lea esi, [esi + 32] 5613 lea esi, [esi + 32]
5624 sub ecx, 32 5614 sub ecx, 32
5625 jg xloop50 5615 jg xloop50
5626 jmp xloop99 5616 jmp xloop99
5627 5617
5628 // Blend 75 / 25.
5629 xloop75:
5630 vmovdqu ymm1, [esi]
5631 vmovdqu ymm0, [esi + edx]
5632 vpavgb ymm0, ymm0, ymm1
5633 vpavgb ymm0, ymm0, ymm1
5634 vmovdqu [esi + edi], ymm0
5635 lea esi, [esi + 32]
5636 sub ecx, 32
5637 jg xloop75
5638 jmp xloop99
5639
5640 // Blend 100 / 0 - Copy row unchanged. 5618 // Blend 100 / 0 - Copy row unchanged.
5641 xloop100: 5619 xloop100:
5642 rep movsb 5620 rep movsb
5643 5621
5644 xloop99: 5622 xloop99:
5645 pop edi 5623 pop edi
5646 pop esi 5624 pop esi
5647 vzeroupper 5625 vzeroupper
5648 ret 5626 ret
5649 } 5627 }
(...skipping 11 matching lines...) Expand all
5661 mov edi, [esp + 8 + 4] // dst_ptr 5639 mov edi, [esp + 8 + 4] // dst_ptr
5662 mov esi, [esp + 8 + 8] // src_ptr 5640 mov esi, [esp + 8 + 8] // src_ptr
5663 mov edx, [esp + 8 + 12] // src_stride 5641 mov edx, [esp + 8 + 12] // src_stride
5664 mov ecx, [esp + 8 + 16] // dst_width 5642 mov ecx, [esp + 8 + 16] // dst_width
5665 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5666 sub edi, esi 5644 sub edi, esi
5667 shr eax, 1 5645 shr eax, 1
5668 // Dispatch to specialized filters if applicable. 5646 // Dispatch to specialized filters if applicable.
5669 cmp eax, 0 5647 cmp eax, 0
5670 je xloop100 // 0 / 128. Blend 100 / 0. 5648 je xloop100 // 0 / 128. Blend 100 / 0.
5671 cmp eax, 32
5672 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5673 cmp eax, 64 5649 cmp eax, 64
5674 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5675 cmp eax, 96
5676 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5677 5651
5678 movd xmm0, eax // high fraction 0..127 5652 movd xmm0, eax // high fraction 0..127
5679 neg eax 5653 neg eax
5680 add eax, 128 5654 add eax, 128
5681 movd xmm5, eax // low fraction 128..1 5655 movd xmm5, eax // low fraction 128..1
5682 punpcklbw xmm5, xmm0 5656 punpcklbw xmm5, xmm0
5683 punpcklwd xmm5, xmm5 5657 punpcklwd xmm5, xmm5
5684 pshufd xmm5, xmm5, 0 5658 pshufd xmm5, xmm5, 0
5685 5659
5660 mov eax, 0x00400040 // 64 for rounding.
5661 movd xmm4, eax
5662 pshufd xmm4, xmm4, 0x00
5663
5686 xloop: 5664 xloop:
5687 movdqu xmm0, [esi] 5665 movdqu xmm0, [esi]
5688 movdqu xmm2, [esi + edx] 5666 movdqu xmm2, [esi + edx]
5689 movdqu xmm1, xmm0 5667 movdqu xmm1, xmm0
5690 punpcklbw xmm0, xmm2 5668 punpcklbw xmm0, xmm2
5691 punpckhbw xmm1, xmm2 5669 punpckhbw xmm1, xmm2
5692 pmaddubsw xmm0, xmm5 5670 pmaddubsw xmm0, xmm5
5693 pmaddubsw xmm1, xmm5 5671 pmaddubsw xmm1, xmm5
5672 paddw xmm0, xmm4
5673 paddw xmm1, xmm4
5694 psrlw xmm0, 7 5674 psrlw xmm0, 7
5695 psrlw xmm1, 7 5675 psrlw xmm1, 7
5696 packuswb xmm0, xmm1 5676 packuswb xmm0, xmm1
5697 movdqu [esi + edi], xmm0 5677 movdqu [esi + edi], xmm0
5698 lea esi, [esi + 16] 5678 lea esi, [esi + 16]
5699 sub ecx, 16 5679 sub ecx, 16
5700 jg xloop 5680 jg xloop
5701 jmp xloop99 5681 jmp xloop99
5702 5682
5703 // Blend 25 / 75.
5704 xloop25:
5705 movdqu xmm0, [esi]
5706 movdqu xmm1, [esi + edx]
5707 pavgb xmm0, xmm1
5708 pavgb xmm0, xmm1
5709 movdqu [esi + edi], xmm0
5710 lea esi, [esi + 16]
5711 sub ecx, 16
5712 jg xloop25
5713 jmp xloop99
5714
5715 // Blend 50 / 50. 5683 // Blend 50 / 50.
5716 xloop50: 5684 xloop50:
5717 movdqu xmm0, [esi] 5685 movdqu xmm0, [esi]
5718 movdqu xmm1, [esi + edx] 5686 movdqu xmm1, [esi + edx]
5719 pavgb xmm0, xmm1 5687 pavgb xmm0, xmm1
5720 movdqu [esi + edi], xmm0 5688 movdqu [esi + edi], xmm0
5721 lea esi, [esi + 16] 5689 lea esi, [esi + 16]
5722 sub ecx, 16 5690 sub ecx, 16
5723 jg xloop50 5691 jg xloop50
5724 jmp xloop99 5692 jmp xloop99
5725 5693
5726 // Blend 75 / 25.
5727 xloop75:
5728 movdqu xmm1, [esi]
5729 movdqu xmm0, [esi + edx]
5730 pavgb xmm0, xmm1
5731 pavgb xmm0, xmm1
5732 movdqu [esi + edi], xmm0
5733 lea esi, [esi + 16]
5734 sub ecx, 16
5735 jg xloop75
5736 jmp xloop99
5737
5738 // Blend 100 / 0 - Copy row unchanged. 5694 // Blend 100 / 0 - Copy row unchanged.
5739 xloop100: 5695 xloop100:
5740 movdqu xmm0, [esi] 5696 movdqu xmm0, [esi]
5741 movdqu [esi + edi], xmm0
5742 lea esi, [esi + 16]
5743 sub ecx, 16
5744 jg xloop100
5745
5746 xloop99:
5747 pop edi
5748 pop esi
5749 ret
5750 }
5751 }
5752
5753 #ifdef HAS_INTERPOLATEROW_SSE2
5754 // Bilinear filter 16x2 -> 16x1
5755 __declspec(naked)
5756 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5757 ptrdiff_t src_stride, int dst_width,
5758 int source_y_fraction) {
5759 __asm {
5760 push esi
5761 push edi
5762 mov edi, [esp + 8 + 4] // dst_ptr
5763 mov esi, [esp + 8 + 8] // src_ptr
5764 mov edx, [esp + 8 + 12] // src_stride
5765 mov ecx, [esp + 8 + 16] // dst_width
5766 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5767 sub edi, esi
5768 // Dispatch to specialized filters if applicable.
5769 cmp eax, 0
5770 je xloop100 // 0 / 256. Blend 100 / 0.
5771 cmp eax, 64
5772 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
5773 cmp eax, 128
5774 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5775 cmp eax, 192
5776 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
5777
5778 movd xmm5, eax // xmm5 = y fraction
5779 punpcklbw xmm5, xmm5
5780 psrlw xmm5, 1
5781 punpcklwd xmm5, xmm5
5782 punpckldq xmm5, xmm5
5783 punpcklqdq xmm5, xmm5
5784 pxor xmm4, xmm4
5785
5786 xloop:
5787 movdqu xmm0, [esi] // row0
5788 movdqu xmm2, [esi + edx] // row1
5789 movdqu xmm1, xmm0
5790 movdqu xmm3, xmm2
5791 punpcklbw xmm2, xmm4
5792 punpckhbw xmm3, xmm4
5793 punpcklbw xmm0, xmm4
5794 punpckhbw xmm1, xmm4
5795 psubw xmm2, xmm0 // row1 - row0
5796 psubw xmm3, xmm1
5797 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
5798 paddw xmm3, xmm3
5799 pmulhw xmm2, xmm5 // scale diff
5800 pmulhw xmm3, xmm5
5801 paddw xmm0, xmm2 // sum rows
5802 paddw xmm1, xmm3
5803 packuswb xmm0, xmm1
5804 movdqu [esi + edi], xmm0
5805 lea esi, [esi + 16]
5806 sub ecx, 16
5807 jg xloop
5808 jmp xloop99
5809
5810 // Blend 25 / 75.
5811 xloop25:
5812 movdqu xmm0, [esi]
5813 movdqu xmm1, [esi + edx]
5814 pavgb xmm0, xmm1
5815 pavgb xmm0, xmm1
5816 movdqu [esi + edi], xmm0
5817 lea esi, [esi + 16]
5818 sub ecx, 16
5819 jg xloop25
5820 jmp xloop99
5821
5822 // Blend 50 / 50.
5823 xloop50:
5824 movdqu xmm0, [esi]
5825 movdqu xmm1, [esi + edx]
5826 pavgb xmm0, xmm1
5827 movdqu [esi + edi], xmm0
5828 lea esi, [esi + 16]
5829 sub ecx, 16
5830 jg xloop50
5831 jmp xloop99
5832
5833 // Blend 75 / 25.
5834 xloop75:
5835 movdqu xmm1, [esi]
5836 movdqu xmm0, [esi + edx]
5837 pavgb xmm0, xmm1
5838 pavgb xmm0, xmm1
5839 movdqu [esi + edi], xmm0
5840 lea esi, [esi + 16]
5841 sub ecx, 16
5842 jg xloop75
5843 jmp xloop99
5844
5845 // Blend 100 / 0 - Copy row unchanged.
5846 xloop100:
5847 movdqu xmm0, [esi]
5848 movdqu [esi + edi], xmm0 5697 movdqu [esi + edi], xmm0
5849 lea esi, [esi + 16] 5698 lea esi, [esi + 16]
5850 sub ecx, 16 5699 sub ecx, 16
5851 jg xloop100 5700 jg xloop100
5852 5701
5853 xloop99: 5702 xloop99:
5854 pop edi 5703 pop edi
5855 pop esi 5704 pop esi
5856 ret 5705 ret
5857 } 5706 }
5858 } 5707 }
5859 #endif // HAS_INTERPOLATEROW_SSE2
5860 5708
5861 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5709 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5862 __declspec(naked) 5710 __declspec(naked)
5863 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5711 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5864 const uint8* shuffler, int width) { 5712 const uint8* shuffler, int width) {
5865 __asm { 5713 __asm {
5866 mov eax, [esp + 4] // src_argb 5714 mov eax, [esp + 4] // src_argb
5867 mov edx, [esp + 8] // dst_argb 5715 mov edx, [esp + 8] // dst_argb
5868 mov ecx, [esp + 12] // shuffler 5716 mov ecx, [esp + 12] // shuffler
5869 movdqu xmm5, [ecx] 5717 movdqu xmm5, [ecx]
(...skipping 501 matching lines...) Expand 10 before | Expand all | Expand 10 after
6371 } 6219 }
6372 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6373 6221
6374 #endif // defined(_M_X64) 6222 #endif // defined(_M_X64)
6375 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6376 6224
6377 #ifdef __cplusplus 6225 #ifdef __cplusplus
6378 } // extern "C" 6226 } // extern "C"
6379 } // namespace libyuv 6227 } // namespace libyuv
6380 #endif 6228 #endif
OLD | NEW
« no previous file with comments | « source/row_neon64.cc ('k') | source/scale.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698