OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 5548 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5559 ptrdiff_t src_stride, int dst_width, | 5559 ptrdiff_t src_stride, int dst_width, |
5560 int source_y_fraction) { | 5560 int source_y_fraction) { |
5561 __asm { | 5561 __asm { |
5562 push esi | 5562 push esi |
5563 push edi | 5563 push edi |
5564 mov edi, [esp + 8 + 4] // dst_ptr | 5564 mov edi, [esp + 8 + 4] // dst_ptr |
5565 mov esi, [esp + 8 + 8] // src_ptr | 5565 mov esi, [esp + 8 + 8] // src_ptr |
5566 mov edx, [esp + 8 + 12] // src_stride | 5566 mov edx, [esp + 8 + 12] // src_stride |
5567 mov ecx, [esp + 8 + 16] // dst_width | 5567 mov ecx, [esp + 8 + 16] // dst_width |
5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
5569 shr eax, 1 | |
5570 // Dispatch to specialized filters if applicable. | 5569 // Dispatch to specialized filters if applicable. |
5571 cmp eax, 0 | 5570 cmp eax, 0 |
5572 je xloop100 // 0 / 128. Blend 100 / 0. | 5571 je xloop100 // 0 / 256. Blend 100 / 0. |
5573 sub edi, esi | 5572 sub edi, esi |
5574 cmp eax, 64 | 5573 cmp eax, 128 |
5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5574 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
5576 | 5575 |
5577 vmovd xmm0, eax // high fraction 0..127 | 5576 vmovd xmm0, eax // high fraction 0..255 |
5578 neg eax | 5577 neg eax |
5579 add eax, 128 | 5578 add eax, 256 |
5580 vmovd xmm5, eax // low fraction 128..1 | 5579 vmovd xmm5, eax // low fraction 255..1 |
5581 vpunpcklbw xmm5, xmm5, xmm0 | 5580 vpunpcklbw xmm5, xmm5, xmm0 |
5582 vpunpcklwd xmm5, xmm5, xmm5 | 5581 vpunpcklwd xmm5, xmm5, xmm5 |
5583 vpxor ymm0, ymm0, ymm0 | 5582 vbroadcastss ymm5, xmm5 |
5584 vpermd ymm5, ymm0, ymm5 | |
5585 | 5583 |
5586 mov eax, 0x00400040 // 64 for rounding. | 5584 mov eax, 0x80808080 // 128b for bias and rounding. |
5587 vmovd xmm4, eax | 5585 vmovd xmm4, eax |
5588 vbroadcastss ymm4, xmm4 | 5586 vbroadcastss ymm4, xmm4 |
5589 | 5587 |
5590 xloop: | 5588 xloop: |
5591 vmovdqu ymm0, [esi] | 5589 vmovdqu ymm0, [esi] |
5592 vmovdqu ymm2, [esi + edx] | 5590 vmovdqu ymm2, [esi + edx] |
5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates | 5591 vpunpckhbw ymm1, ymm0, ymm2 // mutates |
5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates | 5592 vpunpcklbw ymm0, ymm0, ymm2 |
5595 vpmaddubsw ymm0, ymm0, ymm5 | 5593 vpsubb ymm1, ymm1, ymm4 // bias to signed image |
5596 vpmaddubsw ymm1, ymm1, ymm5 | 5594 vpsubb ymm0, ymm0, ymm4 |
| 5595 vpmaddubsw ymm1, ymm5, ymm1 |
| 5596 vpmaddubsw ymm0, ymm5, ymm0 |
| 5597 vpaddw ymm1, ymm1, ymm4 // unbias and round |
5597 vpaddw ymm0, ymm0, ymm4 | 5598 vpaddw ymm0, ymm0, ymm4 |
5598 vpaddw ymm1, ymm1, ymm4 | 5599 vpsrlw ymm1, ymm1, 8 |
5599 vpsrlw ymm0, ymm0, 7 | 5600 vpsrlw ymm0, ymm0, 8 |
5600 vpsrlw ymm1, ymm1, 7 | |
5601 vpackuswb ymm0, ymm0, ymm1 // unmutates | 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates |
5602 vmovdqu [esi + edi], ymm0 | 5602 vmovdqu [esi + edi], ymm0 |
5603 lea esi, [esi + 32] | 5603 lea esi, [esi + 32] |
5604 sub ecx, 32 | 5604 sub ecx, 32 |
5605 jg xloop | 5605 jg xloop |
5606 jmp xloop99 | 5606 jmp xloop99 |
5607 | 5607 |
5608 // Blend 50 / 50. | 5608 // Blend 50 / 50. |
5609 xloop50: | 5609 xloop50: |
5610 vmovdqu ymm0, [esi] | 5610 vmovdqu ymm0, [esi] |
(...skipping 11 matching lines...) Expand all Loading... |
5622 xloop99: | 5622 xloop99: |
5623 pop edi | 5623 pop edi |
5624 pop esi | 5624 pop esi |
5625 vzeroupper | 5625 vzeroupper |
5626 ret | 5626 ret |
5627 } | 5627 } |
5628 } | 5628 } |
5629 #endif // HAS_INTERPOLATEROW_AVX2 | 5629 #endif // HAS_INTERPOLATEROW_AVX2 |
5630 | 5630 |
5631 // Bilinear filter 16x2 -> 16x1 | 5631 // Bilinear filter 16x2 -> 16x1 |
| 5632 // TODO(fbarchard): Consider allowing 256 using memcpy. |
5632 __declspec(naked) | 5633 __declspec(naked) |
5633 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 5634 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
5634 ptrdiff_t src_stride, int dst_width, | 5635 ptrdiff_t src_stride, int dst_width, |
5635 int source_y_fraction) { | 5636 int source_y_fraction) { |
5636 __asm { | 5637 __asm { |
5637 push esi | 5638 push esi |
5638 push edi | 5639 push edi |
| 5640 |
5639 mov edi, [esp + 8 + 4] // dst_ptr | 5641 mov edi, [esp + 8 + 4] // dst_ptr |
5640 mov esi, [esp + 8 + 8] // src_ptr | 5642 mov esi, [esp + 8 + 8] // src_ptr |
5641 mov edx, [esp + 8 + 12] // src_stride | 5643 mov edx, [esp + 8 + 12] // src_stride |
5642 mov ecx, [esp + 8 + 16] // dst_width | 5644 mov ecx, [esp + 8 + 16] // dst_width |
5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5645 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
5644 sub edi, esi | 5646 sub edi, esi |
5645 shr eax, 1 | |
5646 // Dispatch to specialized filters if applicable. | 5647 // Dispatch to specialized filters if applicable. |
5647 cmp eax, 0 | 5648 cmp eax, 0 |
5648 je xloop100 // 0 / 128. Blend 100 / 0. | 5649 je xloop100 // 0 / 256. Blend 100 / 0. |
5649 cmp eax, 64 | 5650 cmp eax, 128 |
5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5651 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
5651 | 5652 |
5652 movd xmm0, eax // high fraction 0..127 | 5653 movd xmm0, eax // high fraction 0..255 |
5653 neg eax | 5654 neg eax |
5654 add eax, 128 | 5655 add eax, 256 |
5655 movd xmm5, eax // low fraction 128..1 | 5656 movd xmm5, eax // low fraction 255..1 |
5656 punpcklbw xmm5, xmm0 | 5657 punpcklbw xmm5, xmm0 |
5657 punpcklwd xmm5, xmm5 | 5658 punpcklwd xmm5, xmm5 |
5658 pshufd xmm5, xmm5, 0 | 5659 pshufd xmm5, xmm5, 0 |
5659 | 5660 mov eax, 0x80808080 // 128b: biases image to signed, then unbiases and rounds. |
5660 mov eax, 0x00400040 // 64 for rounding. | |
5661 movd xmm4, eax | 5661 movd xmm4, eax |
5662 pshufd xmm4, xmm4, 0x00 | 5662 pshufd xmm4, xmm4, 0x00 |
5663 | 5663 |
5664 xloop: | 5664 xloop: |
5665 movdqu xmm0, [esi] | 5665 movdqu xmm0, [esi] |
5666 movdqu xmm2, [esi + edx] | 5666 movdqu xmm2, [esi + edx] |
5667 movdqu xmm1, xmm0 | 5667 movdqu xmm1, xmm0 |
5668 punpcklbw xmm0, xmm2 | 5668 punpcklbw xmm0, xmm2 |
5669 punpckhbw xmm1, xmm2 | 5669 punpckhbw xmm1, xmm2 |
5670 pmaddubsw xmm0, xmm5 | 5670 psubb xmm0, xmm4 // bias image by -128 |
5671 pmaddubsw xmm1, xmm5 | 5671 psubb xmm1, xmm4 |
5672 paddw xmm0, xmm4 | 5672 movdqa xmm2, xmm5 |
5673 paddw xmm1, xmm4 | 5673 movdqa xmm3, xmm5 |
5674 psrlw xmm0, 7 | 5674 pmaddubsw xmm2, xmm0 |
5675 psrlw xmm1, 7 | 5675 pmaddubsw xmm3, xmm1 |
5676 packuswb xmm0, xmm1 | 5676 paddw xmm2, xmm4 |
5677 movdqu [esi + edi], xmm0 | 5677 paddw xmm3, xmm4 |
| 5678 psrlw xmm2, 8 |
| 5679 psrlw xmm3, 8 |
| 5680 packuswb xmm2, xmm3 |
| 5681 movdqu [esi + edi], xmm2 |
5678 lea esi, [esi + 16] | 5682 lea esi, [esi + 16] |
5679 sub ecx, 16 | 5683 sub ecx, 16 |
5680 jg xloop | 5684 jg xloop |
5681 jmp xloop99 | 5685 jmp xloop99 |
5682 | 5686 |
5683 // Blend 50 / 50. | 5687 // Blend 50 / 50. |
5684 xloop50: | 5688 xloop50: |
5685 movdqu xmm0, [esi] | 5689 movdqu xmm0, [esi] |
5686 movdqu xmm1, [esi + edx] | 5690 movdqu xmm1, [esi + edx] |
5687 pavgb xmm0, xmm1 | 5691 pavgb xmm0, xmm1 |
(...skipping 531 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6219 } | 6223 } |
6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6224 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6221 | 6225 |
6222 #endif // defined(_M_X64) | 6226 #endif // defined(_M_X64) |
6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6227 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6224 | 6228 |
6225 #ifdef __cplusplus | 6229 #ifdef __cplusplus |
6226 } // extern "C" | 6230 } // extern "C" |
6227 } // namespace libyuv | 6231 } // namespace libyuv |
6228 #endif | 6232 #endif |
OLD | NEW |