Index: source/libvpx/third_party/libyuv/source/scale_win.cc |
=================================================================== |
--- source/libvpx/third_party/libyuv/source/scale_win.cc (revision 0) |
+++ source/libvpx/third_party/libyuv/source/scale_win.cc (revision 0) |
@@ -0,0 +1,1320 @@ |
+/* |
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
+ * |
+ * Use of this source code is governed by a BSD-style license |
+ * that can be found in the LICENSE file in the root of the source |
+ * tree. An additional intellectual property rights grant can be found |
+ * in the file PATENTS. All contributing project authors may |
+ * be found in the AUTHORS file in the root of the source tree. |
+ */ |
+ |
+#include "third_party/libyuv/include/libyuv/row.h" |
+ |
+#ifdef __cplusplus |
+namespace libyuv { |
+extern "C" { |
+#endif |
+ |
+// This module is for Visual C x86. |
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
+ |
+// Offsets for source bytes 0 to 9 (pshufb index 128 writes a zero byte). |
+static uvec8 kShuf0 = |
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. |
+static uvec8 kShuf1 = |
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
+static uvec8 kShuf2 = |
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+// Offsets for source bytes 0 to 10, arranged as pairs for pmaddubsw. |
+static uvec8 kShuf01 = |
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; |
+ |
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13. |
+static uvec8 kShuf11 = |
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; |
+ |
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
+static uvec8 kShuf21 = |
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; |
+ |
+// Coefficients for source bytes 0 to 10 (each pair of weights sums to 4). |
+static uvec8 kMadd01 = |
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; |
+ |
+// Coefficients for source bytes 10 to 21 |
+static uvec8 kMadd11 = |
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; |
+ |
+// Coefficients for source bytes 21 to 31 |
+static uvec8 kMadd21 = |
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; |
+ |
+// Rounding constant added to each word before the >> 2 in the 3/4 box filters. |
+static vec16 kRound34 = |
+ { 2, 2, 2, 2, 2, 2, 2, 2 }; |
+ |
+static uvec8 kShuf38a = |
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+static uvec8 kShuf38b = |
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; |
+ |
+// Arrange words 0,3,6 into 0,1,2 |
+static uvec8 kShufAc = |
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+// Arrange words 0,3,6 into 3,4,5 |
+static uvec8 kShufAc3 = |
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; |
+ |
+// Scaling values for boxes of 3x3 and 2x3 (65536 / box area, for pmulhuw). |
+static uvec16 kScaleAc33 = |
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; |
+ |
+// Arrange first value for pixels 0,1,2,3,4,5 |
+static uvec8 kShufAb0 = |
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; |
+ |
+// Arrange second value for pixels 0,1,2,3,4,5 |
+static uvec8 kShufAb1 = |
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; |
+ |
+// Arrange third value for pixels 0,1,2,3,4,5 (pixels 2 and 5 have none). |
+static uvec8 kShufAb2 = |
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
+ |
+// Scaling values for boxes of 3x2 and 2x2 (rows are pre-averaged with pavgb). |
+static uvec16 kScaleAb2 = |
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
+ |
+// Point samples 32 pixels to 16: keeps the odd bytes and drops the even ones. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (16 pixels written per pass) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ psrlw xmm0, 8 // move the odd byte of each word into the low byte. |
+ psrlw xmm1, 8 |
+ packuswb xmm0, xmm1 |
+ sub ecx, 16 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x1 rectangle to 16x1 by averaging each horizontal pixel pair. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (16 pixels written per pass) |
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
+ psrlw xmm5, 8 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ |
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels) |
+ psrlw xmm0, 8 |
+ movdqa xmm3, xmm1 |
+ psrlw xmm1, 8 |
+ pand xmm2, xmm5 |
+ pand xmm3, xmm5 |
+ pavgw xmm0, xmm2 |
+ pavgw xmm1, xmm3 |
+ packuswb xmm0, xmm1 |
+ |
+ sub ecx, 16 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x2 rectangle to 16x1: averages the 2 rows, then each column pair. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride (byte offset to the second row) |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (16 pixels written per pass) |
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
+ psrlw xmm5, 8 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ movdqa xmm2, [eax + esi] |
+ movdqa xmm3, [eax + esi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm0, xmm2 // average rows |
+ pavgb xmm1, xmm3 |
+ |
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels) |
+ psrlw xmm0, 8 |
+ movdqa xmm3, xmm1 |
+ psrlw xmm1, 8 |
+ pand xmm2, xmm5 |
+ pand xmm3, xmm5 |
+ pavgw xmm0, xmm2 |
+ pavgw xmm1, xmm3 |
+ packuswb xmm0, xmm1 |
+ |
+ sub ecx, 16 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Point samples 32 pixels to 16: keeps the odd bytes and drops the even ones. |
+// Alignment requirement: none; src_ptr and dst_ptr are accessed with movdqu. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (16 pixels written per pass) |
+ |
+ align 4 |
+ wloop: |
+ movdqu xmm0, [eax] |
+ movdqu xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ psrlw xmm0, 8 // move the odd byte of each word into the low byte. |
+ psrlw xmm1, 8 |
+ packuswb xmm0, xmm1 |
+ sub ecx, 16 |
+ movdqu [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x1 rectangle to 16x1 by averaging each horizontal pixel pair. |
+// Alignment requirement: none; src_ptr and dst_ptr are accessed with movdqu. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (16 pixels written per pass) |
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
+ psrlw xmm5, 8 |
+ |
+ align 4 |
+ wloop: |
+ movdqu xmm0, [eax] |
+ movdqu xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ |
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels) |
+ psrlw xmm0, 8 |
+ movdqa xmm3, xmm1 |
+ psrlw xmm1, 8 |
+ pand xmm2, xmm5 |
+ pand xmm3, xmm5 |
+ pavgw xmm0, xmm2 |
+ pavgw xmm1, xmm3 |
+ packuswb xmm0, xmm1 |
+ |
+ sub ecx, 16 |
+ movdqu [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x2 rectangle to 16x1: averages the 2 rows, then each column pair. |
+// Alignment requirement: none; src_ptr and dst_ptr are accessed with movdqu. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride (byte offset to the second row) |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (16 pixels written per pass) |
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
+ psrlw xmm5, 8 |
+ |
+ align 4 |
+ wloop: |
+ movdqu xmm0, [eax] |
+ movdqu xmm1, [eax + 16] |
+ movdqu xmm2, [eax + esi] |
+ movdqu xmm3, [eax + esi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm0, xmm2 // average rows |
+ pavgb xmm1, xmm3 |
+ |
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels) |
+ psrlw xmm0, 8 |
+ movdqa xmm3, xmm1 |
+ psrlw xmm1, 8 |
+ pand xmm2, xmm5 |
+ pand xmm3, xmm5 |
+ pavgw xmm0, xmm2 |
+ pavgw xmm1, xmm3 |
+ packuswb xmm0, xmm1 |
+ |
+ sub ecx, 16 |
+ movdqu [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Point samples 32 pixels to 8 pixels (keeps byte 2 of every 4 source bytes). |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (8 pixels written per pass) |
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 |
+ psrld xmm5, 24 |
+ pslld xmm5, 16 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ pand xmm0, xmm5 |
+ pand xmm1, xmm5 |
+ packuswb xmm0, xmm1 |
+ psrlw xmm0, 8 |
+ packuswb xmm0, xmm0 |
+ sub ecx, 8 |
+ movq qword ptr [edx], xmm0 |
+ lea edx, [edx + 8] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x4 rectangle to 8x1: averages 4 rows, then halves columns twice. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ push edi |
+ mov eax, [esp + 8 + 4] // src_ptr |
+ mov esi, [esp + 8 + 8] // src_stride |
+ mov edx, [esp + 8 + 12] // dst_ptr |
+ mov ecx, [esp + 8 + 16] // dst_width (8 pixels written per pass) |
+ lea edi, [esi + esi * 2] // src_stride * 3 (offset of the fourth row) |
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff |
+ psrlw xmm7, 8 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ movdqa xmm2, [eax + esi] |
+ movdqa xmm3, [eax + esi + 16] |
+ pavgb xmm0, xmm2 // average rows |
+ pavgb xmm1, xmm3 |
+ movdqa xmm2, [eax + esi * 2] |
+ movdqa xmm3, [eax + esi * 2 + 16] |
+ movdqa xmm4, [eax + edi] |
+ movdqa xmm5, [eax + edi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm2, xmm4 |
+ pavgb xmm3, xmm5 |
+ pavgb xmm0, xmm2 |
+ pavgb xmm1, xmm3 |
+ |
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels) |
+ psrlw xmm0, 8 |
+ movdqa xmm3, xmm1 |
+ psrlw xmm1, 8 |
+ pand xmm2, xmm7 |
+ pand xmm3, xmm7 |
+ pavgw xmm0, xmm2 |
+ pavgw xmm1, xmm3 |
+ packuswb xmm0, xmm1 |
+ |
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels) |
+ psrlw xmm0, 8 |
+ pand xmm2, xmm7 |
+ pavgw xmm0, xmm2 |
+ packuswb xmm0, xmm0 |
+ |
+ sub ecx, 8 |
+ movq qword ptr [edx], xmm0 |
+ lea edx, [edx + 8] |
+ jg wloop |
+ |
+ pop edi |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Point samples 32 pixels to 24 pixels. |
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. |
+// Then shuffled (kShuf0, kShuf1, kShuf2) to do the scaling. |
+ |
+// Note that movdqa+palign may be better than movdqu. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (24 pixels written per pass) |
+ movdqa xmm3, kShuf0 |
+ movdqa xmm4, kShuf1 |
+ movdqa xmm5, kShuf2 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ movdqa xmm2, xmm1 |
+ palignr xmm1, xmm0, 8 |
+ pshufb xmm0, xmm3 |
+ pshufb xmm1, xmm4 |
+ pshufb xmm2, xmm5 |
+ movq qword ptr [edx], xmm0 |
+ movq qword ptr [edx + 8], xmm1 |
+ movq qword ptr [edx + 16], xmm2 |
+ lea edx, [edx + 24] |
+ sub ecx, 24 |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 32x2 rectangle to 24x1 with both rows weighted equally (one pavgb). |
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. |
+// Then shuffled to do the scaling. |
+ |
+// Register usage: |
+// xmm0 src_row 0 |
+// xmm1 src_row 1 (reloaded with kMadd21 for the third group) |
+// xmm2 shuf 0 |
+// xmm3 shuf 1 |
+// xmm4 shuf 2 |
+// xmm5 madd 0 |
+// xmm6 madd 1 |
+// xmm7 kRound34 |
+ |
+// Note that movdqa+palign may be better than movdqu. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (24 pixels written per pass) |
+ movdqa xmm2, kShuf01 |
+ movdqa xmm3, kShuf11 |
+ movdqa xmm4, kShuf21 |
+ movdqa xmm5, kMadd01 |
+ movdqa xmm6, kMadd11 |
+ movdqa xmm7, kRound34 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] // pixels 0..7 |
+ movdqa xmm1, [eax + esi] |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm2 |
+ pmaddubsw xmm0, xmm5 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ movq qword ptr [edx], xmm0 |
+ movdqu xmm0, [eax + 8] // pixels 8..15 |
+ movdqu xmm1, [eax + esi + 8] |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm3 |
+ pmaddubsw xmm0, xmm6 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ movq qword ptr [edx + 8], xmm0 |
+ movdqa xmm0, [eax + 16] // pixels 16..23 |
+ movdqa xmm1, [eax + esi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm4 |
+ movdqa xmm1, kMadd21 |
+ pmaddubsw xmm0, xmm1 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ sub ecx, 24 |
+ movq qword ptr [edx + 16], xmm0 |
+ lea edx, [edx + 24] |
+ jg wloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Blends 32x2 to 24x1 with row 0 weighted about 3:1 over row 1 (pavgb twice). Note that movdqa+palign may be better than movdqu. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (24 pixels written per pass) |
+ movdqa xmm2, kShuf01 |
+ movdqa xmm3, kShuf11 |
+ movdqa xmm4, kShuf21 |
+ movdqa xmm5, kMadd01 |
+ movdqa xmm6, kMadd11 |
+ movdqa xmm7, kRound34 |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] // pixels 0..7 |
+ movdqa xmm1, [eax + esi] |
+ pavgb xmm1, xmm0 |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm2 |
+ pmaddubsw xmm0, xmm5 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ movq qword ptr [edx], xmm0 |
+ movdqu xmm0, [eax + 8] // pixels 8..15 |
+ movdqu xmm1, [eax + esi + 8] |
+ pavgb xmm1, xmm0 |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm3 |
+ pmaddubsw xmm0, xmm6 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ movq qword ptr [edx + 8], xmm0 |
+ movdqa xmm0, [eax + 16] // pixels 16..23 |
+ movdqa xmm1, [eax + esi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm1, xmm0 |
+ pavgb xmm0, xmm1 |
+ pshufb xmm0, xmm4 |
+ movdqa xmm1, kMadd21 |
+ pmaddubsw xmm0, xmm1 |
+ paddsw xmm0, xmm7 |
+ psrlw xmm0, 2 |
+ packuswb xmm0, xmm0 |
+ sub ecx, 24 |
+ movq qword ptr [edx + 16], xmm0 |
+ lea edx, [edx+24] |
+ jg wloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// 3/8 point sampler |
+ |
+// Scale 32 pixels to 12 (6 from each 16 byte load, via kShuf38a/kShuf38b). |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_ptr |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_ptr |
+ mov ecx, [esp + 16] // dst_width (12 pixels written per pass) |
+ movdqa xmm4, kShuf38a |
+ movdqa xmm5, kShuf38b |
+ |
+ align 4 |
+ xloop: |
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 |
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 |
+ lea eax, [eax + 32] |
+ pshufb xmm0, xmm4 |
+ pshufb xmm1, xmm5 |
+ paddusb xmm0, xmm1 |
+ |
+ sub ecx, 12 |
+ movq qword ptr [edx], xmm0 // write 12 pixels (8 here, 4 below) |
+ movhlps xmm1, xmm0 |
+ movd [edx + 8], xmm1 |
+ lea edx, [edx + 12] |
+ jg xloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Scale 16x3 pixels to 6x1 with interpolation (box sums scaled by kScaleAc33). |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (6 pixels advanced per pass) |
+ movdqa xmm2, kShufAc |
+ movdqa xmm3, kShufAc3 |
+ movdqa xmm4, kScaleAc33 |
+ pxor xmm5, xmm5 |
+ |
+ align 4 |
+ xloop: |
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 |
+ movdqa xmm6, [eax + esi] |
+ movhlps xmm1, xmm0 |
+ movhlps xmm7, xmm6 |
+ punpcklbw xmm0, xmm5 |
+ punpcklbw xmm1, xmm5 |
+ punpcklbw xmm6, xmm5 |
+ punpcklbw xmm7, xmm5 |
+ paddusw xmm0, xmm6 |
+ paddusw xmm1, xmm7 |
+ movdqa xmm6, [eax + esi * 2] |
+ lea eax, [eax + 16] |
+ movhlps xmm7, xmm6 |
+ punpcklbw xmm6, xmm5 |
+ punpcklbw xmm7, xmm5 |
+ paddusw xmm0, xmm6 |
+ paddusw xmm1, xmm7 |
+ |
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 |
+ psrldq xmm0, 2 |
+ paddusw xmm6, xmm0 |
+ psrldq xmm0, 2 |
+ paddusw xmm6, xmm0 |
+ pshufb xmm6, xmm2 |
+ |
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 |
+ psrldq xmm1, 2 |
+ paddusw xmm7, xmm1 |
+ psrldq xmm1, 2 |
+ paddusw xmm7, xmm1 |
+ pshufb xmm7, xmm3 |
+ paddusw xmm6, xmm7 |
+ |
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 |
+ packuswb xmm6, xmm6 |
+ |
+ sub ecx, 6 |
+ movd [edx], xmm6 // write 6 pixels (two overlapping 4 byte stores) |
+ psrlq xmm6, 16 |
+ movd [edx + 2], xmm6 |
+ lea edx, [edx + 6] |
+ jg xloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Scale 16x2 pixels to 6x1 with interpolation (box sums scaled by kScaleAb2). |
+__declspec(naked) __declspec(align(16)) |
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, |
+ ptrdiff_t src_stride, |
+ uint8* dst_ptr, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_ptr |
+ mov esi, [esp + 4 + 8] // src_stride |
+ mov edx, [esp + 4 + 12] // dst_ptr |
+ mov ecx, [esp + 4 + 16] // dst_width (6 pixels advanced per pass) |
+ movdqa xmm2, kShufAb0 |
+ movdqa xmm3, kShufAb1 |
+ movdqa xmm4, kShufAb2 |
+ movdqa xmm5, kScaleAb2 |
+ |
+ align 4 |
+ xloop: |
+ movdqa xmm0, [eax] // average 2 rows into xmm0 |
+ pavgb xmm0, [eax + esi] |
+ lea eax, [eax + 16] |
+ |
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 |
+ pshufb xmm1, xmm2 |
+ movdqa xmm6, xmm0 |
+ pshufb xmm6, xmm3 |
+ paddusw xmm1, xmm6 |
+ pshufb xmm0, xmm4 |
+ paddusw xmm1, xmm0 |
+ |
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 |
+ packuswb xmm1, xmm1 |
+ |
+ sub ecx, 6 |
+ movd [edx], xmm1 // write 6 pixels (two overlapping 4 byte stores) |
+ psrlq xmm1, 16 |
+ movd [edx + 2], xmm1 |
+ lea edx, [edx + 6] |
+ jg xloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Reads 16xN bytes and produces 16 shorts (column sums) at a time. |
+// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
+ uint16* dst_ptr, int src_width, |
+ int src_height) { |
+ __asm { |
+ push esi |
+ push edi |
+ push ebx |
+ push ebp |
+ mov esi, [esp + 16 + 4] // src_ptr |
+ mov edx, [esp + 16 + 8] // src_stride |
+ mov edi, [esp + 16 + 12] // dst_ptr |
+ mov ecx, [esp + 16 + 16] // src_width (16 columns per pass) |
+ mov ebx, [esp + 16 + 20] // src_height |
+ pxor xmm4, xmm4 |
+ dec ebx |
+ |
+ align 4 |
+ xloop: |
+ // first row |
+ movdqa xmm0, [esi] |
+ lea eax, [esi + edx] |
+ movdqa xmm1, xmm0 |
+ punpcklbw xmm0, xmm4 |
+ punpckhbw xmm1, xmm4 |
+ lea esi, [esi + 16] |
+ mov ebp, ebx |
+ test ebp, ebp |
+ je ydone |
+ |
+ // sum remaining rows |
+ align 4 |
+ yloop: |
+ movdqa xmm2, [eax] // read 16 pixels |
+ lea eax, [eax + edx] // advance to next row |
+ movdqa xmm3, xmm2 |
+ punpcklbw xmm2, xmm4 |
+ punpckhbw xmm3, xmm4 |
+ paddusw xmm0, xmm2 // sum 16 words (unsigned saturating add) |
+ paddusw xmm1, xmm3 |
+ sub ebp, 1 |
+ jg yloop |
+ |
+ align 4 |
+ ydone: |
+ movdqa [edi], xmm0 |
+ movdqa [edi + 16], xmm1 |
+ lea edi, [edi + 32] |
+ |
+ sub ecx, 16 |
+ jg xloop |
+ |
+ pop ebp |
+ pop ebx |
+ pop edi |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Bilinear column filtering. SSSE3 version. |
+// TODO(fbarchard): Port to Neon |
+// TODO(fbarchard): Switch the following: |
+// xor ebx, ebx |
+// mov bx, word ptr [esi + eax] // 2 source x0 pixels |
+// To |
+// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
+// when drmemory bug fixed. NOTE(review): the code below already uses movzx. |
+// https://code.google.com/p/drmemory/issues/detail?id=1396 |
+ |
+__declspec(naked) __declspec(align(16)) |
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
+ int dst_width, int x, int dx) { |
+ __asm { |
+ push ebx |
+ push esi |
+ push edi |
+ mov edi, [esp + 12 + 4] // dst_ptr |
+ mov esi, [esp + 12 + 8] // src_ptr |
+ mov ecx, [esp + 12 + 12] // dst_width |
+ movd xmm2, [esp + 12 + 16] // x (integer part is read from word 1) |
+ movd xmm3, [esp + 12 + 20] // dx |
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel. |
+ movd xmm5, eax |
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. |
+ psrlw xmm6, 9 |
+ pextrw eax, xmm2, 1 // get x0 integer. preroll |
+ sub ecx, 2 |
+ jl xloop29 |
+ |
+ movdqa xmm0, xmm2 // x1 = x0 + dx |
+ paddd xmm0, xmm3 |
+ punpckldq xmm2, xmm0 // x0 x1 |
+ punpckldq xmm3, xmm3 // dx dx |
+ paddd xmm3, xmm3 // dx * 2, dx * 2 |
+ pextrw edx, xmm2, 3 // get x1 integer. preroll |
+ |
+ // 2 Pixel loop. |
+ align 4 |
+ xloop2: |
+ movdqa xmm1, xmm2 // x0, x1 fractions. |
+ paddd xmm2, xmm3 // x += dx |
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
+ movd xmm0, ebx |
+ psrlw xmm1, 9 // 7 bit fractions. |
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels |
+ movd xmm4, ebx |
+ pshufb xmm1, xmm5 // 0011 |
+ punpcklwd xmm0, xmm4 |
+ pxor xmm1, xmm6 // 0..7f and 7f..0 |
+ pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. |
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration. |
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration. |
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. |
+ packuswb xmm0, xmm0 // 8 bits, 2 pixels. |
+ movd ebx, xmm0 |
+ mov [edi], bx |
+ lea edi, [edi + 2] |
+ sub ecx, 2 // 2 pixels |
+ jge xloop2 |
+ |
+ align 4 |
+ xloop29: |
+ |
+ add ecx, 2 - 1 |
+ jl xloop99 |
+ |
+ // 1 pixel remainder |
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels |
+ movd xmm0, ebx |
+ psrlw xmm2, 9 // 7 bit fractions. |
+ pshufb xmm2, xmm5 // 0011 |
+ pxor xmm2, xmm6 // 0..7f and 7f..0 |
+ pmaddubsw xmm0, xmm2 // 16 bit |
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. |
+ packuswb xmm0, xmm0 // 8 bits |
+ movd ebx, xmm0 |
+ mov [edi], bl |
+ |
+ align 4 |
+ xloop99: |
+ |
+ pop edi |
+ pop esi |
+ pop ebx |
+ ret |
+ } |
+} |
+ |
+// Reads 16 pixels, duplicates them and writes 32 pixels. |
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
+ int dst_width, int x, int dx) { |
+ __asm { |
+ mov edx, [esp + 4] // dst_ptr |
+ mov eax, [esp + 8] // src_ptr |
+ mov ecx, [esp + 12] // dst_width (x and dx are not read) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ lea eax, [eax + 16] |
+ movdqa xmm1, xmm0 |
+ punpcklbw xmm0, xmm0 |
+ punpckhbw xmm1, xmm1 |
+ sub ecx, 32 |
+ movdqa [edx], xmm0 |
+ movdqa [edx + 16], xmm1 |
+ lea edx, [edx + 32] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7) |
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, |
+ ptrdiff_t src_stride, |
+ uint8* dst_argb, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_argb |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_argb |
+ mov ecx, [esp + 16] // dst_width (4 pixels written per pass) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ shufps xmm0, xmm1, 0xdd |
+ sub ecx, 4 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 8x1 rectangle to 4x1 by averaging each even/odd pixel pair. |
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, |
+ ptrdiff_t src_stride, |
+ uint8* dst_argb, int dst_width) { |
+ __asm { |
+ mov eax, [esp + 4] // src_argb |
+ // src_stride ignored |
+ mov edx, [esp + 12] // dst_argb |
+ mov ecx, [esp + 16] // dst_width (4 pixels written per pass) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ lea eax, [eax + 32] |
+ movdqa xmm2, xmm0 |
+ shufps xmm0, xmm1, 0x88 // even pixels |
+ shufps xmm2, xmm1, 0xdd // odd pixels |
+ pavgb xmm0, xmm2 |
+ sub ecx, 4 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Blends 8x2 rectangle to 4x1: averages the 2 rows, then even/odd pixel pairs. |
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, |
+ ptrdiff_t src_stride, |
+ uint8* dst_argb, int dst_width) { |
+ __asm { |
+ push esi |
+ mov eax, [esp + 4 + 4] // src_argb |
+ mov esi, [esp + 4 + 8] // src_stride (byte offset to the second row) |
+ mov edx, [esp + 4 + 12] // dst_argb |
+ mov ecx, [esp + 4 + 16] // dst_width (4 pixels written per pass) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ movdqa xmm1, [eax + 16] |
+ movdqa xmm2, [eax + esi] |
+ movdqa xmm3, [eax + esi + 16] |
+ lea eax, [eax + 32] |
+ pavgb xmm0, xmm2 // average rows |
+ pavgb xmm1, xmm3 |
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels) |
+ shufps xmm0, xmm1, 0x88 // even pixels |
+ shufps xmm2, xmm1, 0xdd // odd pixels |
+ pavgb xmm0, xmm2 |
+ sub ecx, 4 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Reads 4 pixels at a time, one every src_stepx pixels, and writes them packed. |
+// Alignment requirement: dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
+ int src_stepx, |
+ uint8* dst_argb, int dst_width) { |
+ __asm { |
+ push ebx |
+ push edi |
+ mov eax, [esp + 8 + 4] // src_argb |
+ // src_stride ignored |
+ mov ebx, [esp + 8 + 12] // src_stepx (scaled by 4 into a byte step below) |
+ mov edx, [esp + 8 + 16] // dst_argb |
+ mov ecx, [esp + 8 + 20] // dst_width (4 pixels written per pass) |
+ lea ebx, [ebx * 4] |
+ lea edi, [ebx + ebx * 2] |
+ |
+ align 4 |
+ wloop: |
+ movd xmm0, [eax] |
+ movd xmm1, [eax + ebx] |
+ punpckldq xmm0, xmm1 |
+ movd xmm2, [eax + ebx * 2] |
+ movd xmm3, [eax + edi] |
+ lea eax, [eax + ebx * 4] |
+ punpckldq xmm2, xmm3 |
+ punpcklqdq xmm0, xmm2 |
+ sub ecx, 4 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ pop edi |
+ pop ebx |
+ ret |
+ } |
+} |
+ |
+// Blends four 2x2 to 4x1 (one 2x2 box taken every src_stepx pixels). |
+// Alignment requirement: dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, |
+ ptrdiff_t src_stride, |
+ int src_stepx, |
+ uint8* dst_argb, int dst_width) { |
+ __asm { |
+ push ebx |
+ push esi |
+ push edi |
+ mov eax, [esp + 12 + 4] // src_argb |
+ mov esi, [esp + 12 + 8] // src_stride |
+ mov ebx, [esp + 12 + 12] // src_stepx (scaled by 4 into a byte step below) |
+ mov edx, [esp + 12 + 16] // dst_argb |
+ mov ecx, [esp + 12 + 20] // dst_width (4 pixels written per pass) |
+ lea esi, [eax + esi] // row1 pointer |
+ lea ebx, [ebx * 4] |
+ lea edi, [ebx + ebx * 2] |
+ |
+ align 4 |
+ wloop: |
+ movq xmm0, qword ptr [eax] // row0 4 pairs |
+ movhps xmm0, qword ptr [eax + ebx] |
+ movq xmm1, qword ptr [eax + ebx * 2] |
+ movhps xmm1, qword ptr [eax + edi] |
+ lea eax, [eax + ebx * 4] |
+ movq xmm2, qword ptr [esi] // row1 4 pairs |
+ movhps xmm2, qword ptr [esi + ebx] |
+ movq xmm3, qword ptr [esi + ebx * 2] |
+ movhps xmm3, qword ptr [esi + edi] |
+ lea esi, [esi + ebx * 4] |
+ pavgb xmm0, xmm2 // average rows |
+ pavgb xmm1, xmm3 |
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels) |
+ shufps xmm0, xmm1, 0x88 // even pixels |
+ shufps xmm2, xmm1, 0xdd // odd pixels |
+ pavgb xmm0, xmm2 |
+ sub ecx, 4 |
+ movdqa [edx], xmm0 |
+ lea edx, [edx + 16] |
+ jg wloop |
+ |
+ pop edi |
+ pop esi |
+ pop ebx |
+ ret |
+ } |
+} |
+ |
+// Column scaling unfiltered (point sampled ARGB). SSE2 version. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, |
+ int dst_width, int x, int dx) { |
+ __asm { |
+ push edi |
+ push esi |
+ mov edi, [esp + 8 + 4] // dst_argb |
+ mov esi, [esp + 8 + 8] // src_argb |
+ mov ecx, [esp + 8 + 12] // dst_width |
+ movd xmm2, [esp + 8 + 16] // x (integer part is read from the high word) |
+ movd xmm3, [esp + 8 + 20] // dx |
+ |
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 |
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 |
+ paddd xmm2, xmm0 |
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2 |
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 |
+ paddd xmm2, xmm0 // x3 x2 x1 x0 |
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4 |
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 |
+ |
+ pextrw eax, xmm2, 1 // get x0 integer. |
+ pextrw edx, xmm2, 3 // get x1 integer. |
+ |
+ cmp ecx, 0 |
+ jle xloop99 |
+ sub ecx, 4 |
+ jl xloop49 |
+ |
+ // 4 Pixel loop. |
+ align 4 |
+ xloop4: |
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels |
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels |
+ pextrw eax, xmm2, 5 // get x2 integer. |
+ pextrw edx, xmm2, 7 // get x3 integer. |
+ paddd xmm2, xmm3 // x += dx * 4 |
+ punpckldq xmm0, xmm1 // x0 x1 |
+ |
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels |
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels |
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration. |
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration. |
+ punpckldq xmm1, xmm4 // x2 x3 |
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 |
+ sub ecx, 4 // 4 pixels |
+ movdqu [edi], xmm0 |
+ lea edi, [edi + 16] |
+ jge xloop4 |
+ |
+ align 4 |
+ xloop49: |
+ test ecx, 2 |
+ je xloop29 |
+ |
+ // 2 Pixels. |
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels |
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels |
+ pextrw eax, xmm2, 5 // get x2 integer. |
+ punpckldq xmm0, xmm1 // x0 x1 |
+ |
+ movq qword ptr [edi], xmm0 |
+ lea edi, [edi + 8] |
+ |
+ xloop29: |
+ test ecx, 1 |
+ je xloop99 |
+ |
+ // 1 Pixels. |
+ movd xmm0, [esi + eax * 4] // 1 source pixel (x2 after the 2 pixel step, else x0) |
+ movd dword ptr [edi], xmm0 |
+ align 4 |
+ xloop99: |
+ |
+ pop esi |
+ pop edi |
+ ret |
+ } |
+} |
+ |
+// Bilinear column filtering for ARGB combines 2x1 -> 1x1. SSSE3 version. |
+// TODO(fbarchard): Port to Neon |
+ |
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw |
+static uvec8 kShuffleColARGB = { |
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel |
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel |
+}; |
+ |
+// Shuffle table for duplicating 2 fractions (bytes 0 and 4) into 8 bytes each |
+static uvec8 kShuffleFractions = { |
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
+}; |
+ |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
+ int dst_width, int x, int dx) { |
+ __asm { |
+ push esi |
+ push edi |
+ mov edi, [esp + 8 + 4] // dst_argb |
+ mov esi, [esp + 8 + 8] // src_argb |
+ mov ecx, [esp + 8 + 12] // dst_width |
+ movd xmm2, [esp + 8 + 16] // x (integer part is read from word 1) |
+ movd xmm3, [esp + 8 + 20] // dx |
+ movdqa xmm4, kShuffleColARGB |
+ movdqa xmm5, kShuffleFractions |
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. |
+ psrlw xmm6, 9 |
+ pextrw eax, xmm2, 1 // get x0 integer. preroll |
+ sub ecx, 2 |
+ jl xloop29 |
+ |
+ movdqa xmm0, xmm2 // x1 = x0 + dx |
+ paddd xmm0, xmm3 |
+ punpckldq xmm2, xmm0 // x0 x1 |
+ punpckldq xmm3, xmm3 // dx dx |
+ paddd xmm3, xmm3 // dx * 2, dx * 2 |
+ pextrw edx, xmm2, 3 // get x1 integer. preroll |
+ |
+ // 2 Pixel loop. |
+ align 4 |
+ xloop2: |
+ movdqa xmm1, xmm2 // x0, x1 fractions. |
+ paddd xmm2, xmm3 // x += dx * 2 |
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels |
+ psrlw xmm1, 9 // 7 bit fractions. |
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels |
+ pshufb xmm1, xmm5 // 0000000011111111 |
+ pshufb xmm0, xmm4 // arrange pixels into pairs |
+ pxor xmm1, xmm6 // 0..7f and 7f..0 |
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. |
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration. |
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration. |
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. |
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. |
+ movq qword ptr [edi], xmm0 |
+ lea edi, [edi + 8] |
+ sub ecx, 2 // 2 pixels |
+ jge xloop2 |
+ |
+ align 4 |
+ xloop29: |
+ |
+ add ecx, 2 - 1 |
+ jl xloop99 |
+ |
+ // 1 pixel remainder |
+ psrlw xmm2, 9 // 7 bit fractions. |
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels |
+ pshufb xmm2, xmm5 // 00000000 |
+ pshufb xmm0, xmm4 // arrange pixels into pairs |
+ pxor xmm2, xmm6 // 0..7f and 7f..0 |
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. |
+ psrlw xmm0, 7 |
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. |
+ movd [edi], xmm0 |
+ |
+ align 4 |
+ xloop99: |
+ |
+ pop edi |
+ pop esi |
+ ret |
+ } |
+} |
+ |
+// Reads 4 pixels, duplicates each one and writes 8 pixels. |
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
+__declspec(naked) __declspec(align(16)) |
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, |
+ int dst_width, int x, int dx) { |
+ __asm { |
+ mov edx, [esp + 4] // dst_argb |
+ mov eax, [esp + 8] // src_argb |
+ mov ecx, [esp + 12] // dst_width (x and dx are not read) |
+ |
+ align 4 |
+ wloop: |
+ movdqa xmm0, [eax] |
+ lea eax, [eax + 16] |
+ movdqa xmm1, xmm0 |
+ punpckldq xmm0, xmm0 |
+ punpckhdq xmm1, xmm1 |
+ sub ecx, 8 |
+ movdqa [edx], xmm0 |
+ movdqa [edx + 16], xmm1 |
+ lea edx, [edx + 32] |
+ jg wloop |
+ |
+ ret |
+ } |
+} |
+ |
+// Divide num by div and return as 16.16 fixed point: (num << 16) / div. |
+__declspec(naked) __declspec(align(16)) |
+int FixedDiv_X86(int num, int div) { |
+ __asm { |
+ mov eax, [esp + 4] // num |
+ cdq // extend num to 64 bits |
+ shld edx, eax, 16 // 32.16: edx:eax = num << 16 (with the shl below) |
+ shl eax, 16 |
+ idiv dword ptr [esp + 8] |
+ ret |
+ } |
+} |
+ |
+// Returns ((num << 16) - 0x00010001) / (div - 1) as a 16.16 fixed point result. |
+__declspec(naked) __declspec(align(16)) |
+int FixedDiv1_X86(int num, int div) { |
+ __asm { |
+ mov eax, [esp + 4] // num |
+ mov ecx, [esp + 8] // div |
+ cdq // extend num to 64 bits |
+ shld edx, eax, 16 // 32.16: edx:eax = num << 16 (with the shl below) |
+ shl eax, 16 |
+ sub eax, 0x00010001 |
+ sbb edx, 0 |
+ sub ecx, 1 |
+ idiv ecx |
+ ret |
+ } |
+} |
+ |
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
+ |
+#ifdef __cplusplus |
+} // extern "C" |
+} // namespace libyuv |
+#endif |