| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 5548 matching lines...) |
| 5559 ptrdiff_t src_stride, int dst_width, | 5559 ptrdiff_t src_stride, int dst_width, |
| 5560 int source_y_fraction) { | 5560 int source_y_fraction) { |
| 5561 __asm { | 5561 __asm { |
| 5562 push esi | 5562 push esi |
| 5563 push edi | 5563 push edi |
| 5564 mov edi, [esp + 8 + 4] // dst_ptr | 5564 mov edi, [esp + 8 + 4] // dst_ptr |
| 5565 mov esi, [esp + 8 + 8] // src_ptr | 5565 mov esi, [esp + 8 + 8] // src_ptr |
| 5566 mov edx, [esp + 8 + 12] // src_stride | 5566 mov edx, [esp + 8 + 12] // src_stride |
| 5567 mov ecx, [esp + 8 + 16] // dst_width | 5567 mov ecx, [esp + 8 + 16] // dst_width |
| 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5568 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| 5569 shr eax, 1 | |
| 5570 // Dispatch to specialized filters if applicable. | 5569 // Dispatch to specialized filters if applicable. |
| 5571 cmp eax, 0 | 5570 cmp eax, 0 |
| 5572 je xloop100 // 0 / 128. Blend 100 / 0. | 5571 je xloop100 // 0 / 256. Blend 100 / 0. |
| 5573 sub edi, esi | 5572 sub edi, esi |
| 5574 cmp eax, 64 | 5573 cmp eax, 128 |
| 5575 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5574 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
| 5576 | 5575 |
| 5577 vmovd xmm0, eax // high fraction 0..127 | 5576 vmovd xmm0, eax // high fraction 0..255 |
| 5578 neg eax | 5577 neg eax |
| 5579 add eax, 128 | 5578 add eax, 256 |
| 5580 vmovd xmm5, eax // low fraction 128..1 | 5579 vmovd xmm5, eax // low fraction 255..1 |
| 5581 vpunpcklbw xmm5, xmm5, xmm0 | 5580 vpunpcklbw xmm5, xmm5, xmm0 |
| 5582 vpunpcklwd xmm5, xmm5, xmm5 | 5581 vpunpcklwd xmm5, xmm5, xmm5 |
| 5583 vpxor ymm0, ymm0, ymm0 | 5582 vbroadcastss ymm5, xmm5 |
| 5584 vpermd ymm5, ymm0, ymm5 | |
| 5585 | 5583 |
| 5586 mov eax, 0x00400040 // 64 for rounding. | 5584 mov eax, 0x80808080 // 128b for bias and rounding. |
| 5587 vmovd xmm4, eax | 5585 vmovd xmm4, eax |
| 5588 vbroadcastss ymm4, xmm4 | 5586 vbroadcastss ymm4, xmm4 |
| 5589 | 5587 |
| 5590 xloop: | 5588 xloop: |
| 5591 vmovdqu ymm0, [esi] | 5589 vmovdqu ymm0, [esi] |
| 5592 vmovdqu ymm2, [esi + edx] | 5590 vmovdqu ymm2, [esi + edx] |
| 5593 vpunpckhbw ymm1, ymm0, ymm2 // mutates | 5591 vpunpckhbw ymm1, ymm0, ymm2 // mutates |
| 5594 vpunpcklbw ymm0, ymm0, ymm2 // mutates | 5592 vpunpcklbw ymm0, ymm0, ymm2 |
| 5595 vpmaddubsw ymm0, ymm0, ymm5 | 5593 vpsubb ymm1, ymm1, ymm4 // bias to signed image |
| 5596 vpmaddubsw ymm1, ymm1, ymm5 | 5594 vpsubb ymm0, ymm0, ymm4 |
| | 5595 vpmaddubsw ymm1, ymm5, ymm1 |
| | 5596 vpmaddubsw ymm0, ymm5, ymm0 |
| | 5597 vpaddw ymm1, ymm1, ymm4 // unbias and round |
| 5597 vpaddw ymm0, ymm0, ymm4 | 5598 vpaddw ymm0, ymm0, ymm4 |
| 5598 vpaddw ymm1, ymm1, ymm4 | 5599 vpsrlw ymm1, ymm1, 8 |
| 5599 vpsrlw ymm0, ymm0, 7 | 5600 vpsrlw ymm0, ymm0, 8 |
| 5600 vpsrlw ymm1, ymm1, 7 | |
| 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates | 5601 vpackuswb ymm0, ymm0, ymm1 // unmutates |
| 5602 vmovdqu [esi + edi], ymm0 | 5602 vmovdqu [esi + edi], ymm0 |
| 5603 lea esi, [esi + 32] | 5603 lea esi, [esi + 32] |
| 5604 sub ecx, 32 | 5604 sub ecx, 32 |
| 5605 jg xloop | 5605 jg xloop |
| 5606 jmp xloop99 | 5606 jmp xloop99 |
| 5607 | 5607 |
| 5608 // Blend 50 / 50. | 5608 // Blend 50 / 50. |
| 5609 xloop50: | 5609 xloop50: |
| 5610 vmovdqu ymm0, [esi] | 5610 vmovdqu ymm0, [esi] |
| (...skipping 11 matching lines...) |
| 5622 xloop99: | 5622 xloop99: |
| 5623 pop edi | 5623 pop edi |
| 5624 pop esi | 5624 pop esi |
| 5625 vzeroupper | 5625 vzeroupper |
| 5626 ret | 5626 ret |
| 5627 } | 5627 } |
| 5628 } | 5628 } |
| 5629 #endif // HAS_INTERPOLATEROW_AVX2 | 5629 #endif // HAS_INTERPOLATEROW_AVX2 |
| 5630 | 5630 |
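The change above moves the blend from a 7-bit fraction (source_y_fraction halved to 0..128, rounding constant 64, shift by 7) to the full 8-bit fraction (0..256, rounding constant 128, shift by 8). Because the low weight can now be as large as 255 it no longer fits in a signed byte, so pmaddubsw is used with the operands swapped: the weights become the unsigned operand and the pixels are biased by 128 into the signed operand, after which the 0x8080 word add both removes the bias and rounds. A minimal scalar sketch of the per-pixel result the new AVX2/SSSE3 loops compute (the function name is hypothetical, not part of libyuv):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical scalar reference (not a libyuv function) for the result the
 * patched AVX2/SSSE3 loops produce. */
static void InterpolateRow_C_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  const int f = source_y_fraction;             /* 0..255 */
  const uint8_t* src1 = src_ptr + src_stride;  /* second source row */
  int x;
  if (f == 0) {                                /* xloop100: copy top row */
    memcpy(dst_ptr, src_ptr, (size_t)dst_width);
    return;
  }
  for (x = 0; x < dst_width; ++x) {
    /* (256 - f) * top + f * bottom, +128 to round, then back to 8 bits.
     * f == 128 reduces to (top + bottom + 1) >> 1, the pavgb path. */
    dst_ptr[x] = (uint8_t)(((256 - f) * src_ptr[x] + f * src1[x] + 128) >> 8);
  }
}
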
| 5631 // Bilinear filter 16x2 -> 16x1 | 5631 // Bilinear filter 16x2 -> 16x1 |
| | 5632 // TODO(fbarchard): Consider allowing 256 using memcpy. |
| 5632 __declspec(naked) | 5633 __declspec(naked) |
| 5633 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 5634 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 5634 ptrdiff_t src_stride, int dst_width, | 5635 ptrdiff_t src_stride, int dst_width, |
| 5635 int source_y_fraction) { | 5636 int source_y_fraction) { |
| 5636 __asm { | 5637 __asm { |
| 5637 push esi | 5638 push esi |
| 5638 push edi | 5639 push edi |
| | 5640 |
| 5639 mov edi, [esp + 8 + 4] // dst_ptr | 5641 mov edi, [esp + 8 + 4] // dst_ptr |
| 5640 mov esi, [esp + 8 + 8] // src_ptr | 5642 mov esi, [esp + 8 + 8] // src_ptr |
| 5641 mov edx, [esp + 8 + 12] // src_stride | 5643 mov edx, [esp + 8 + 12] // src_stride |
| 5642 mov ecx, [esp + 8 + 16] // dst_width | 5644 mov ecx, [esp + 8 + 16] // dst_width |
| 5643 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) | 5645 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| 5644 sub edi, esi | 5646 sub edi, esi |
| 5645 shr eax, 1 | |
| 5646 // Dispatch to specialized filters if applicable. | 5647 // Dispatch to specialized filters if applicable. |
| 5647 cmp eax, 0 | 5648 cmp eax, 0 |
| 5648 je xloop100 // 0 / 128. Blend 100 / 0. | 5649 je xloop100 // 0 / 256. Blend 100 / 0. |
| 5649 cmp eax, 64 | 5650 cmp eax, 128 |
| 5650 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. | 5651 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
| 5651 | 5652 |
| 5652 movd xmm0, eax // high fraction 0..127 | 5653 movd xmm0, eax // high fraction 0..255 |
| 5653 neg eax | 5654 neg eax |
| 5654 add eax, 128 | 5655 add eax, 256 |
| 5655 movd xmm5, eax // low fraction 128..1 | 5656 movd xmm5, eax // low fraction 255..1 |
| 5656 punpcklbw xmm5, xmm0 | 5657 punpcklbw xmm5, xmm0 |
| 5657 punpcklwd xmm5, xmm5 | 5658 punpcklwd xmm5, xmm5 |
| 5658 pshufd xmm5, xmm5, 0 | 5659 pshufd xmm5, xmm5, 0 |
| 5659 | 5660 mov eax, 0x80808080 // 128 for biasing image to signed. |
| 5660 mov eax, 0x00400040 // 64 for rounding. | |
| 5661 movd xmm4, eax | 5661 movd xmm4, eax |
| 5662 pshufd xmm4, xmm4, 0x00 | 5662 pshufd xmm4, xmm4, 0x00 |
| 5663 | 5663 |
| 5664 xloop: | 5664 xloop: |
| 5665 movdqu xmm0, [esi] | 5665 movdqu xmm0, [esi] |
| 5666 movdqu xmm2, [esi + edx] | 5666 movdqu xmm2, [esi + edx] |
| 5667 movdqu xmm1, xmm0 | 5667 movdqu xmm1, xmm0 |
| 5668 punpcklbw xmm0, xmm2 | 5668 punpcklbw xmm0, xmm2 |
| 5669 punpckhbw xmm1, xmm2 | 5669 punpckhbw xmm1, xmm2 |
| 5670 pmaddubsw xmm0, xmm5 | 5670 psubb xmm0, xmm4 // bias image by -128 |
| 5671 pmaddubsw xmm1, xmm5 | 5671 psubb xmm1, xmm4 |
| 5672 paddw xmm0, xmm4 | 5672 movdqa xmm2, xmm5 |
| 5673 paddw xmm1, xmm4 | 5673 movdqa xmm3, xmm5 |
| 5674 psrlw xmm0, 7 | 5674 pmaddubsw xmm2, xmm0 |
| 5675 psrlw xmm1, 7 | 5675 pmaddubsw xmm3, xmm1 |
| 5676 packuswb xmm0, xmm1 | 5676 paddw xmm2, xmm4 |
| 5677 movdqu [esi + edi], xmm0 | 5677 paddw xmm3, xmm4 |
| | 5678 psrlw xmm2, 8 |
| | 5679 psrlw xmm3, 8 |
| | 5680 packuswb xmm2, xmm3 |
| | 5681 movdqu [esi + edi], xmm2 |
| 5678 lea esi, [esi + 16] | 5682 lea esi, [esi + 16] |
| 5679 sub ecx, 16 | 5683 sub ecx, 16 |
| 5680 jg xloop | 5684 jg xloop |
| 5681 jmp xloop99 | 5685 jmp xloop99 |
| 5682 | 5686 |
| 5683 // Blend 50 / 50. | 5687 // Blend 50 / 50. |
| 5684 xloop50: | 5688 xloop50: |
| 5685 movdqu xmm0, [esi] | 5689 movdqu xmm0, [esi] |
| 5686 movdqu xmm1, [esi + edx] | 5690 movdqu xmm1, [esi + edx] |
| 5687 pavgb xmm0, xmm1 | 5691 pavgb xmm0, xmm1 |
| (...skipping 531 matching lines...) |
| 6219 } | 6223 } |
| 6220 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6224 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6221 | 6225 |
| 6222 #endif // defined(_M_X64) | 6226 #endif // defined(_M_X64) |
| 6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6227 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6224 | 6228 |
| 6225 #ifdef __cplusplus | 6229 #ifdef __cplusplus |
| 6226 } // extern "C" | 6230 } // extern "C" |
| 6227 } // namespace libyuv | 6231 } // namespace libyuv |
| 6228 #endif | 6232 #endif |
| OLD | NEW |
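For reference, pmaddubsw multiplies its first source operand as unsigned bytes against its second as signed bytes with signed saturation, which is why the weight and pixel operands swap roles in the new code and why the pixels need the 0x80 bias. A rough model of one lane and of the full per-pixel blend under those semantics (helper names are made up for illustration; assumes 0 < source_y_fraction < 256, since 0 and 128 take the copy and pavgb fast paths):

#include <stdint.h>

/* Hypothetical model (not a libyuv function) of one 16-bit lane of
 * pmaddubsw as the new loops use it: first operand unsigned bytes (weights),
 * second operand signed bytes (biased pixels), signed saturation. */
static int16_t pmaddubsw_lane(uint8_t w_top, uint8_t w_bot,
                              int8_t p_top, int8_t p_bot) {
  int32_t sum = (int32_t)w_top * p_top + (int32_t)w_bot * p_bot;
  if (sum > 32767) sum = 32767;
  if (sum < -32768) sum = -32768;
  return (int16_t)sum;
}

/* Blend one pixel the way the SIMD code does.  psubb biases the pixels to
 * signed; paddw with 0x8080 then removes the (256 - f + f) * 128 = 32768
 * bias term modulo 2^16 and adds the 128 rounding term in one step. */
static uint8_t blend_pixel(uint8_t top, uint8_t bot, int f) {
  int16_t acc = pmaddubsw_lane((uint8_t)(256 - f), (uint8_t)f,
                               (int8_t)(top - 128), (int8_t)(bot - 128));
  uint16_t word = (uint16_t)((uint16_t)acc + 0x8080u);  /* paddw wraps */
  return (uint8_t)(word >> 8);
}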