Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Side by Side Diff: source/row_win.cc

Issue 1535833003: avx2 interpolate use 8 bit (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: gcc version of interpolate Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_neon64.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 5548 matching lines...) Expand 10 before | Expand all | Expand 10 after
5559 ptrdiff_t src_stride, int dst_width, 5559 ptrdiff_t src_stride, int dst_width,
5560 int source_y_fraction) { 5560 int source_y_fraction) {
5561 __asm { 5561 __asm {
5562 push esi 5562 push esi
5563 push edi 5563 push edi
5564 mov edi, [esp + 8 + 4] // dst_ptr 5564 mov edi, [esp + 8 + 4] // dst_ptr
5565 mov esi, [esp + 8 + 8] // src_ptr 5565 mov esi, [esp + 8 + 8] // src_ptr
5566 mov edx, [esp + 8 + 12] // src_stride 5566 mov edx, [esp + 8 + 12] // src_stride
5567 mov ecx, [esp + 8 + 16] // dst_width 5567 mov ecx, [esp + 8 + 16] // dst_width
5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5569 shr eax, 1
5570 // Dispatch to specialized filters if applicable. 5569 // Dispatch to specialized filters if applicable.
5571 cmp eax, 0 5570 cmp eax, 0
5572 je xloop100 // 0 / 128. Blend 100 / 0. 5571 je xloop100 // 0 / 256. Blend 100 / 0.
5573 sub edi, esi 5572 sub edi, esi
5574 cmp eax, 64 5573 cmp eax, 128
5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5574 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5576 5575
5577 vmovd xmm0, eax // high fraction 0..127 5576 vmovd xmm0, eax // high fraction 0..255
5578 neg eax 5577 neg eax
5579 add eax, 128 5578 add eax, 256
5580 vmovd xmm5, eax // low fraction 128..1 5579 vmovd xmm5, eax // low fraction 255..1
5581 vpunpcklbw xmm5, xmm5, xmm0 5580 vpunpcklbw xmm5, xmm5, xmm0
5582 vpunpcklwd xmm5, xmm5, xmm5 5581 vpunpcklwd xmm5, xmm5, xmm5
5583 vpxor ymm0, ymm0, ymm0 5582 vbroadcastss ymm5, xmm5
5584 vpermd ymm5, ymm0, ymm5
5585 5583
5586 mov eax, 0x00400040 // 64 for rounding. 5584 mov eax, 0x80808080 // 128b for bias and rounding.
5587 vmovd xmm4, eax 5585 vmovd xmm4, eax
5588 vbroadcastss ymm4, xmm4 5586 vbroadcastss ymm4, xmm4
5589 5587
5590 xloop: 5588 xloop:
5591 vmovdqu ymm0, [esi] 5589 vmovdqu ymm0, [esi]
5592 vmovdqu ymm2, [esi + edx] 5590 vmovdqu ymm2, [esi + edx]
5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates 5591 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates 5592 vpunpcklbw ymm0, ymm0, ymm2
5595 vpmaddubsw ymm0, ymm0, ymm5 5593 vpsubb ymm1, ymm1, ymm4 // bias to signed image
5596 vpmaddubsw ymm1, ymm1, ymm5 5594 vpsubb ymm0, ymm0, ymm4
5595 vpmaddubsw ymm1, ymm5, ymm1
5596 vpmaddubsw ymm0, ymm5, ymm0
5597 vpaddw ymm1, ymm1, ymm4 // unbias and round
5597 vpaddw ymm0, ymm0, ymm4 5598 vpaddw ymm0, ymm0, ymm4
5598 vpaddw ymm1, ymm1, ymm4 5599 vpsrlw ymm1, ymm1, 8
5599 vpsrlw ymm0, ymm0, 7 5600 vpsrlw ymm0, ymm0, 8
5600 vpsrlw ymm1, ymm1, 7
5601 vpackuswb ymm0, ymm0, ymm1 // unmutates 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates
5602 vmovdqu [esi + edi], ymm0 5602 vmovdqu [esi + edi], ymm0
5603 lea esi, [esi + 32] 5603 lea esi, [esi + 32]
5604 sub ecx, 32 5604 sub ecx, 32
5605 jg xloop 5605 jg xloop
5606 jmp xloop99 5606 jmp xloop99
5607 5607
5608 // Blend 50 / 50. 5608 // Blend 50 / 50.
5609 xloop50: 5609 xloop50:
5610 vmovdqu ymm0, [esi] 5610 vmovdqu ymm0, [esi]
(...skipping 11 matching lines...) Expand all
5622 xloop99: 5622 xloop99:
5623 pop edi 5623 pop edi
5624 pop esi 5624 pop esi
5625 vzeroupper 5625 vzeroupper
5626 ret 5626 ret
5627 } 5627 }
5628 } 5628 }
5629 #endif // HAS_INTERPOLATEROW_AVX2 5629 #endif // HAS_INTERPOLATEROW_AVX2
5630 5630
5631 // Bilinear filter 16x2 -> 16x1 5631 // Bilinear filter 16x2 -> 16x1
5632 // TODO(fbarchard): Consider allowing 256 using memcpy.
5632 __declspec(naked) 5633 __declspec(naked)
5633 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 5634 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5634 ptrdiff_t src_stride, int dst_width, 5635 ptrdiff_t src_stride, int dst_width,
5635 int source_y_fraction) { 5636 int source_y_fraction) {
5636 __asm { 5637 __asm {
5637 push esi 5638 push esi
5638 push edi 5639 push edi
5640
5639 mov edi, [esp + 8 + 4] // dst_ptr 5641 mov edi, [esp + 8 + 4] // dst_ptr
5640 mov esi, [esp + 8 + 8] // src_ptr 5642 mov esi, [esp + 8 + 8] // src_ptr
5641 mov edx, [esp + 8 + 12] // src_stride 5643 mov edx, [esp + 8 + 12] // src_stride
5642 mov ecx, [esp + 8 + 16] // dst_width 5644 mov ecx, [esp + 8 + 16] // dst_width
5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5645 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5644 sub edi, esi 5646 sub edi, esi
5645 shr eax, 1
5646 // Dispatch to specialized filters if applicable. 5647 // Dispatch to specialized filters if applicable.
5647 cmp eax, 0 5648 cmp eax, 0
5648 je xloop100 // 0 / 128. Blend 100 / 0. 5649 je xloop100 // 0 / 256. Blend 100 / 0.
5649 cmp eax, 64 5650 cmp eax, 128
5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5651 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5651 5652
5652 movd xmm0, eax // high fraction 0..127 5653 movd xmm0, eax // high fraction 0..255
5653 neg eax 5654 neg eax
5654 add eax, 128 5655 add eax, 256
5655 movd xmm5, eax // low fraction 128..1 5656 movd xmm5, eax // low fraction 255..1
5656 punpcklbw xmm5, xmm0 5657 punpcklbw xmm5, xmm0
5657 punpcklwd xmm5, xmm5 5658 punpcklwd xmm5, xmm5
5658 pshufd xmm5, xmm5, 0 5659 pshufd xmm5, xmm5, 0
5659 5660 mov eax, 0x80808080 // 128 for biasing image to signed.
5660 mov eax, 0x00400040 // 64 for rounding.
5661 movd xmm4, eax 5661 movd xmm4, eax
5662 pshufd xmm4, xmm4, 0x00 5662 pshufd xmm4, xmm4, 0x00
5663 5663
5664 xloop: 5664 xloop:
5665 movdqu xmm0, [esi] 5665 movdqu xmm0, [esi]
5666 movdqu xmm2, [esi + edx] 5666 movdqu xmm2, [esi + edx]
5667 movdqu xmm1, xmm0 5667 movdqu xmm1, xmm0
5668 punpcklbw xmm0, xmm2 5668 punpcklbw xmm0, xmm2
5669 punpckhbw xmm1, xmm2 5669 punpckhbw xmm1, xmm2
5670 pmaddubsw xmm0, xmm5 5670 psubb xmm0, xmm4 // bias image by -128
5671 pmaddubsw xmm1, xmm5 5671 psubb xmm1, xmm4
5672 paddw xmm0, xmm4 5672 movdqa xmm2, xmm5
5673 paddw xmm1, xmm4 5673 movdqa xmm3, xmm5
5674 psrlw xmm0, 7 5674 pmaddubsw xmm2, xmm0
5675 psrlw xmm1, 7 5675 pmaddubsw xmm3, xmm1
5676 packuswb xmm0, xmm1 5676 paddw xmm2, xmm4
5677 movdqu [esi + edi], xmm0 5677 paddw xmm3, xmm4
5678 psrlw xmm2, 8
5679 psrlw xmm3, 8
5680 packuswb xmm2, xmm3
5681 movdqu [esi + edi], xmm2
5678 lea esi, [esi + 16] 5682 lea esi, [esi + 16]
5679 sub ecx, 16 5683 sub ecx, 16
5680 jg xloop 5684 jg xloop
5681 jmp xloop99 5685 jmp xloop99
5682 5686
5683 // Blend 50 / 50. 5687 // Blend 50 / 50.
5684 xloop50: 5688 xloop50:
5685 movdqu xmm0, [esi] 5689 movdqu xmm0, [esi]
5686 movdqu xmm1, [esi + edx] 5690 movdqu xmm1, [esi + edx]
5687 pavgb xmm0, xmm1 5691 pavgb xmm0, xmm1
(...skipping 531 matching lines...) Expand 10 before | Expand all | Expand 10 after
6219 } 6223 }
6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6224 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6221 6225
6222 #endif // defined(_M_X64) 6226 #endif // defined(_M_X64)
6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6227 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6224 6228
6225 #ifdef __cplusplus 6229 #ifdef __cplusplus
6226 } // extern "C" 6230 } // extern "C"
6227 } // namespace libyuv 6231 } // namespace libyuv
6228 #endif 6232 #endif
OLDNEW
« no previous file with comments | « source/row_neon64.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698