source/row_win.cc - Issue 1535833003: avx2 interpolate use 8 bit

Side by Side Diff: source/row_win.cc

Issue 1535833003: avx2 interpolate use 8 bit (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: gcc version of interpolate Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 5548 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5559 ptrdiff_t src_stride, int dst_width,	5559 ptrdiff_t src_stride, int dst_width,

5560 int source_y_fraction) {	5560 int source_y_fraction) {

5561 __asm {	5561 __asm {

5562 push esi	5562 push esi

5563 push edi	5563 push edi

5564 mov edi, [esp + 8 + 4] // dst_ptr	5564 mov edi, [esp + 8 + 4] // dst_ptr

5565 mov esi, [esp + 8 + 8] // src_ptr	5565 mov esi, [esp + 8 + 8] // src_ptr

5566 mov edx, [esp + 8 + 12] // src_stride	5566 mov edx, [esp + 8 + 12] // src_stride

5567 mov ecx, [esp + 8 + 16] // dst_width	5567 mov ecx, [esp + 8 + 16] // dst_width

5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)	5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)

5569 shr eax, 1

5570 // Dispatch to specialized filters if applicable.	5569 // Dispatch to specialized filters if applicable.

5571 cmp eax, 0	5570 cmp eax, 0

5572 je xloop100 // 0 / 128. Blend 100 / 0.	5571 je xloop100 // 0 / 256. Blend 100 / 0.

5573 sub edi, esi	5572 sub edi, esi

5574 cmp eax, 64	5573 cmp eax, 128

5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.	5574 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.

5576	5575

5577 vmovd xmm0, eax // high fraction 0..127	5576 vmovd xmm0, eax // high fraction 0..255

5578 neg eax	5577 neg eax

5579 add eax, 128	5578 add eax, 256

5580 vmovd xmm5, eax // low fraction 128..1	5579 vmovd xmm5, eax // low fraction 255..1

5581 vpunpcklbw xmm5, xmm5, xmm0	5580 vpunpcklbw xmm5, xmm5, xmm0

5582 vpunpcklwd xmm5, xmm5, xmm5	5581 vpunpcklwd xmm5, xmm5, xmm5

5583 vpxor ymm0, ymm0, ymm0	5582 vbroadcastss ymm5, xmm5

5584 vpermd ymm5, ymm0, ymm5

5585	5583

5586 mov eax, 0x00400040 // 64 for rounding.	5584 mov eax, 0x80808080 // 128b for bias and rounding.

5587 vmovd xmm4, eax	5585 vmovd xmm4, eax

5588 vbroadcastss ymm4, xmm4	5586 vbroadcastss ymm4, xmm4

5589	5587

5590 xloop:	5588 xloop:

5591 vmovdqu ymm0, [esi]	5589 vmovdqu ymm0, [esi]

5592 vmovdqu ymm2, [esi + edx]	5590 vmovdqu ymm2, [esi + edx]

5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates	5591 vpunpckhbw ymm1, ymm0, ymm2 // mutates

5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates	5592 vpunpcklbw ymm0, ymm0, ymm2

5595 vpmaddubsw ymm0, ymm0, ymm5	5593 vpsubb ymm1, ymm1, ymm4 // bias to signed image

5596 vpmaddubsw ymm1, ymm1, ymm5	5594 vpsubb ymm0, ymm0, ymm4

	5595 vpmaddubsw ymm1, ymm5, ymm1

	5596 vpmaddubsw ymm0, ymm5, ymm0

	5597 vpaddw ymm1, ymm1, ymm4 // unbias and round

5597 vpaddw ymm0, ymm0, ymm4	5598 vpaddw ymm0, ymm0, ymm4

5598 vpaddw ymm1, ymm1, ymm4	5599 vpsrlw ymm1, ymm1, 8

5599 vpsrlw ymm0, ymm0, 7	5600 vpsrlw ymm0, ymm0, 8

5600 vpsrlw ymm1, ymm1, 7

5601 vpackuswb ymm0, ymm0, ymm1 // unmutates	5601 vpackuswb ymm0, ymm0, ymm1 // unmutates

5602 vmovdqu [esi + edi], ymm0	5602 vmovdqu [esi + edi], ymm0

5603 lea esi, [esi + 32]	5603 lea esi, [esi + 32]

5604 sub ecx, 32	5604 sub ecx, 32

5605 jg xloop	5605 jg xloop

5606 jmp xloop99	5606 jmp xloop99

5607	5607

5608 // Blend 50 / 50.	5608 // Blend 50 / 50.

5609 xloop50:	5609 xloop50:

5610 vmovdqu ymm0, [esi]	5610 vmovdqu ymm0, [esi]

(...skipping 11 matching lines...) Expand all Loading...
5622 xloop99:	5622 xloop99:

5623 pop edi	5623 pop edi

5624 pop esi	5624 pop esi

5625 vzeroupper	5625 vzeroupper

5626 ret	5626 ret

5627 }	5627 }

5628 }	5628 }

5629 #endif // HAS_INTERPOLATEROW_AVX2	5629 #endif // HAS_INTERPOLATEROW_AVX2

5630	5630

5631 // Bilinear filter 16x2 -> 16x1	5631 // Bilinear filter 16x2 -> 16x1

	5632 // TODO(fbarchard): Consider allowing 256 using memcpy.

5632 __declspec(naked)	5633 __declspec(naked)

5633 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,	5634 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

5634 ptrdiff_t src_stride, int dst_width,	5635 ptrdiff_t src_stride, int dst_width,

5635 int source_y_fraction) {	5636 int source_y_fraction) {

5636 __asm {	5637 __asm {

5637 push esi	5638 push esi

5638 push edi	5639 push edi

	5640

5639 mov edi, [esp + 8 + 4] // dst_ptr	5641 mov edi, [esp + 8 + 4] // dst_ptr

5640 mov esi, [esp + 8 + 8] // src_ptr	5642 mov esi, [esp + 8 + 8] // src_ptr

5641 mov edx, [esp + 8 + 12] // src_stride	5643 mov edx, [esp + 8 + 12] // src_stride

5642 mov ecx, [esp + 8 + 16] // dst_width	5644 mov ecx, [esp + 8 + 16] // dst_width

5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)	5645 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)

5644 sub edi, esi	5646 sub edi, esi

5645 shr eax, 1

5646 // Dispatch to specialized filters if applicable.	5647 // Dispatch to specialized filters if applicable.

5647 cmp eax, 0	5648 cmp eax, 0

5648 je xloop100 // 0 / 128. Blend 100 / 0.	5649 je xloop100 // 0 / 256. Blend 100 / 0.

5649 cmp eax, 64	5650 cmp eax, 128

5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.	5651 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.

5651	5652

5652 movd xmm0, eax // high fraction 0..127	5653 movd xmm0, eax // high fraction 0..255

5653 neg eax	5654 neg eax

5654 add eax, 128	5655 add eax, 256

5655 movd xmm5, eax // low fraction 128..1	5656 movd xmm5, eax // low fraction 255..1

5656 punpcklbw xmm5, xmm0	5657 punpcklbw xmm5, xmm0

5657 punpcklwd xmm5, xmm5	5658 punpcklwd xmm5, xmm5

5658 pshufd xmm5, xmm5, 0	5659 pshufd xmm5, xmm5, 0

5659	5660 mov eax, 0x80808080 // 128 for biasing image to signed.

5660 mov eax, 0x00400040 // 64 for rounding.

5661 movd xmm4, eax	5661 movd xmm4, eax

5662 pshufd xmm4, xmm4, 0x00	5662 pshufd xmm4, xmm4, 0x00

5663	5663

5664 xloop:	5664 xloop:

5665 movdqu xmm0, [esi]	5665 movdqu xmm0, [esi]

5666 movdqu xmm2, [esi + edx]	5666 movdqu xmm2, [esi + edx]

5667 movdqu xmm1, xmm0	5667 movdqu xmm1, xmm0

5668 punpcklbw xmm0, xmm2	5668 punpcklbw xmm0, xmm2

5669 punpckhbw xmm1, xmm2	5669 punpckhbw xmm1, xmm2

5670 pmaddubsw xmm0, xmm5	5670 psubb xmm0, xmm4 // bias image by -128

5671 pmaddubsw xmm1, xmm5	5671 psubb xmm1, xmm4

5672 paddw xmm0, xmm4	5672 movdqa xmm2, xmm5

5673 paddw xmm1, xmm4	5673 movdqa xmm3, xmm5

5674 psrlw xmm0, 7	5674 pmaddubsw xmm2, xmm0

5675 psrlw xmm1, 7	5675 pmaddubsw xmm3, xmm1

5676 packuswb xmm0, xmm1	5676 paddw xmm2, xmm4

5677 movdqu [esi + edi], xmm0	5677 paddw xmm3, xmm4

	5678 psrlw xmm2, 8

	5679 psrlw xmm3, 8

	5680 packuswb xmm2, xmm3

	5681 movdqu [esi + edi], xmm2

5678 lea esi, [esi + 16]	5682 lea esi, [esi + 16]

5679 sub ecx, 16	5683 sub ecx, 16

5680 jg xloop	5684 jg xloop

5681 jmp xloop99	5685 jmp xloop99

5682	5686

5683 // Blend 50 / 50.	5687 // Blend 50 / 50.

5684 xloop50:	5688 xloop50:

5685 movdqu xmm0, [esi]	5689 movdqu xmm0, [esi]

5686 movdqu xmm1, [esi + edx]	5690 movdqu xmm1, [esi + edx]

5687 pavgb xmm0, xmm1	5691 pavgb xmm0, xmm1

(...skipping 531 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6219 }	6223 }

6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3	6224 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

6221	6225

6222 #endif // defined(_M_X64)	6226 #endif // defined(_M_X64)

6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))	6227 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))

6224	6228

6225 #ifdef __cplusplus	6229 #ifdef __cplusplus

6226 } // extern "C"	6230 } // extern "C"

6227 } // namespace libyuv	6231 } // namespace libyuv

6228 #endif	6232 #endif

OLD	NEW

« no previous file with comments | « source/row_neon64.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »