Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c |
diff --git a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c |
index 71dbb402dd43b0c99991c37a31878199b393706a..5fd2857e14036710bf8006d746636c5a450df13d 100644 |
--- a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c |
+++ b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c |
@@ -8,7 +8,14 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
+// Due to a conflict over ceil() between math.h and the intrinsics headers in |
+// certain vs9 (Visual Studio 2008) configurations, this include must precede |
+// tmmintrin.h. |
+#include "./vp9_rtcd.h" |
+ |
#include <tmmintrin.h> |
+ |
+#include "vp9/common/x86/convolve.h" |
#include "vpx_ports/mem.h" |
#include "vpx_ports/emmintrin_compat.h" |
@@ -38,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
}; |
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
- unsigned int src_pixels_per_line, |
- unsigned char *output_ptr, |
- unsigned int output_pitch, |
- unsigned int output_height, |
- int16_t *filter) { |
+// Kept non-static because the avx2 intrinsics reuse these functions. |
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
+ |
+void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, |
+ ptrdiff_t src_pixels_per_line, |
+ uint8_t *output_ptr, |
+ ptrdiff_t output_pitch, |
+ uint32_t output_height, |
+ const int16_t *filter) { |
__m128i firstFilters, secondFilters, shuffle1, shuffle2; |
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
__m128i addFilterReg64, filtersReg, srcReg, minReg; |
@@ -51,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((__m128i *)filter); |
+ filtersReg = _mm_loadu_si128((const __m128i *)filter); |
// converting the 16 bit (short) to 8 bit (byte) and have the same data |
// in both lanes of 128 bit register. |
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
@@ -72,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); |
for (i = 0; i < output_height; i++) { |
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
// filter the source buffer |
srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); |
@@ -109,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
} |
} |
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, |
- unsigned int src_pixels_per_line, |
- unsigned char *output_ptr, |
- unsigned int output_pitch, |
- unsigned int output_height, |
- int16_t *filter) { |
+void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, |
+ ptrdiff_t src_pixels_per_line, |
+ uint8_t *output_ptr, |
+ ptrdiff_t output_pitch, |
+ uint32_t output_height, |
+ const int16_t *filter) { |
__m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; |
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
@@ -123,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, |
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((__m128i *)filter); |
+ filtersReg = _mm_loadu_si128((const __m128i *)filter); |
// converting the 16 bit (short) to 8 bit (byte) and have the same data |
// in both lanes of 128 bit register. |
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
@@ -147,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, |
filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
for (i = 0; i < output_height; i++) { |
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
// filter the source buffer |
srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); |
@@ -189,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, |
} |
} |
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
- unsigned int src_pixels_per_line, |
- unsigned char *output_ptr, |
- unsigned int output_pitch, |
- unsigned int output_height, |
- int16_t *filter) { |
+static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
+ ptrdiff_t src_pixels_per_line, |
+ uint8_t *output_ptr, |
+ ptrdiff_t output_pitch, |
+ uint32_t output_height, |
+ const int16_t *filter) { |
__m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
__m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
@@ -203,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((__m128i *)filter); |
+ filtersReg = _mm_loadu_si128((const __m128i *)filter); |
// converting the 16 bit (short) to 8 bit (byte) and have the same data |
// in both lanes of 128 bit register. |
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
@@ -227,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
for (i = 0; i < output_height; i++) { |
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
// filter the source buffer |
srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
@@ -254,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
// reading the next 16 bytes. |
// (part of it was being read by earlier read) |
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); |
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
// add and saturate the results together |
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
@@ -306,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, |
} |
} |
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, |
- unsigned int src_pitch, |
- unsigned char *output_ptr, |
- unsigned int out_pitch, |
- unsigned int output_height, |
- int16_t *filter) { |
+void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
+ ptrdiff_t src_pitch, |
+ uint8_t *output_ptr, |
+ ptrdiff_t out_pitch, |
+ uint32_t output_height, |
+ const int16_t *filter) { |
__m128i addFilterReg64, filtersReg, minReg; |
__m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
@@ -321,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, |
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((__m128i *)filter); |
+ filtersReg = _mm_loadu_si128((const __m128i *)filter); |
// converting the 16 bit (short) to 8 bit (byte) and have the same data |
// in both lanes of 128 bit register. |
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
@@ -336,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, |
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
// load the first 7 rows of 8 bytes |
- srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); |
- srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); |
- srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); |
- srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); |
- srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); |
- srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); |
- srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); |
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); |
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
for (i = 0; i < output_height; i++) { |
// load the last 8 bytes |
- srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); |
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
// merge the result together |
srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
@@ -394,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, |
} |
} |
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, |
- unsigned int src_pitch, |
- unsigned char *output_ptr, |
- unsigned int out_pitch, |
- unsigned int output_height, |
- int16_t *filter) { |
+static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
+ ptrdiff_t src_pitch, |
+ uint8_t *output_ptr, |
+ ptrdiff_t out_pitch, |
+ uint32_t output_height, |
+ const int16_t *filter) { |
__m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
__m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
__m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
@@ -409,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, |
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((__m128i *)filter); |
+ filtersReg = _mm_loadu_si128((const __m128i *)filter); |
// converting the 16 bit (short) to 8 bit (byte) and have the same data |
// in both lanes of 128 bit register. |
filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
@@ -424,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, |
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
// load the first 7 rows of 16 bytes |
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); |
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); |
- srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); |
- srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); |
- srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); |
- srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); |
- srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); |
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); |
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
+ srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
for (i = 0; i < output_height; i++) { |
// load the last 16 bytes |
- srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); |
+ srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
// merge the result together |
srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
@@ -508,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, |
output_ptr+=out_pitch; |
} |
} |
+ |
+#if ARCH_X86_64 |
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 |
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 |
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 |
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 |
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 |
+#else // ARCH_X86 |
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
+#endif // ARCH_X86_64 |
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
+ |
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; |
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; |
+ |
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); |
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); |
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); |
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, |
+ ssse3); |
+ |
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
+// uint8_t *dst, ptrdiff_t dst_stride, |
+// const int16_t *filter_x, int x_step_q4, |
+// const int16_t *filter_y, int y_step_q4, |
+// int w, int h); |
+FUN_CONV_2D(, ssse3); |
+FUN_CONV_2D(avg_ , ssse3); |