| Index: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
|
| diff --git a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
|
| index 71dbb402dd43b0c99991c37a31878199b393706a..5fd2857e14036710bf8006d746636c5a450df13d 100644
|
| --- a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
|
| +++ b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
|
| @@ -8,7 +8,14 @@
|
| * be found in the AUTHORS file in the root of the source tree.
|
| */
|
|
|
| +// Due to a header conflict between math.h and intrinsics includes with ceil()
|
| +// in certain configurations under vs9 this include needs to precede
|
| +// tmmintrin.h.
|
| +#include "./vp9_rtcd.h"
|
| +
|
| #include <tmmintrin.h>
|
| +
|
| +#include "vp9/common/x86/convolve.h"
|
| #include "vpx_ports/mem.h"
|
| #include "vpx_ports/emmintrin_compat.h"
|
|
|
| @@ -38,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
|
| 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
|
| };
|
|
|
| -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
|
| - unsigned int src_pixels_per_line,
|
| - unsigned char *output_ptr,
|
| - unsigned int output_pitch,
|
| - unsigned int output_height,
|
| - int16_t *filter) {
|
| +// These are reused by the avx2 intrinsics.
|
| +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
|
| +
|
| +void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
|
| + ptrdiff_t src_pixels_per_line,
|
| + uint8_t *output_ptr,
|
| + ptrdiff_t output_pitch,
|
| + uint32_t output_height,
|
| + const int16_t *filter) {
|
| __m128i firstFilters, secondFilters, shuffle1, shuffle2;
|
| __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
|
| __m128i addFilterReg64, filtersReg, srcReg, minReg;
|
| @@ -51,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
| addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
|
| - filtersReg = _mm_loadu_si128((__m128i *)filter);
|
| + filtersReg = _mm_loadu_si128((const __m128i *)filter);
|
| // converting the 16 bit (short) to 8 bit (byte) and have the same data
|
| // in both lanes of 128 bit register.
|
| filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
|
| @@ -72,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
|
| shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
|
|
|
| for (i = 0; i < output_height; i++) {
|
| - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
|
| + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
|
|
|
| // filter the source buffer
|
| srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
|
| @@ -109,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
|
| }
|
| }
|
|
|
| -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
|
| - unsigned int src_pixels_per_line,
|
| - unsigned char *output_ptr,
|
| - unsigned int output_pitch,
|
| - unsigned int output_height,
|
| - int16_t *filter) {
|
| +void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
|
| + ptrdiff_t src_pixels_per_line,
|
| + uint8_t *output_ptr,
|
| + ptrdiff_t output_pitch,
|
| + uint32_t output_height,
|
| + const int16_t *filter) {
|
| __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
|
| __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
|
| __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
|
| @@ -123,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
| addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
|
| - filtersReg = _mm_loadu_si128((__m128i *)filter);
|
| + filtersReg = _mm_loadu_si128((const __m128i *)filter);
|
| // converting the 16 bit (short) to 8 bit (byte) and have the same data
|
| // in both lanes of 128 bit register.
|
| filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
|
| @@ -147,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
|
| filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
|
|
|
| for (i = 0; i < output_height; i++) {
|
| - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
|
| + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
|
|
|
| // filter the source buffer
|
| srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
|
| @@ -189,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
|
| }
|
| }
|
|
|
| -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
|
| - unsigned int src_pixels_per_line,
|
| - unsigned char *output_ptr,
|
| - unsigned int output_pitch,
|
| - unsigned int output_height,
|
| - int16_t *filter) {
|
| +static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
|
| + ptrdiff_t src_pixels_per_line,
|
| + uint8_t *output_ptr,
|
| + ptrdiff_t output_pitch,
|
| + uint32_t output_height,
|
| + const int16_t *filter) {
|
| __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
|
| __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
|
| __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
|
| @@ -203,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
| addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
|
| - filtersReg = _mm_loadu_si128((__m128i *)filter);
|
| + filtersReg = _mm_loadu_si128((const __m128i *)filter);
|
| // converting the 16 bit (short) to 8 bit (byte) and have the same data
|
| // in both lanes of 128 bit register.
|
| filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
|
| @@ -227,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
|
| filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
|
|
|
| for (i = 0; i < output_height; i++) {
|
| - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
|
| + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
|
|
|
| // filter the source buffer
|
| srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
|
| @@ -254,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // reading the next 16 bytes.
|
| // (part of it was being read by earlier read)
|
| - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
|
| + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
|
|
|
| // add and saturate the results together
|
| srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
|
| @@ -306,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
|
| }
|
| }
|
|
|
| -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
|
| - unsigned int src_pitch,
|
| - unsigned char *output_ptr,
|
| - unsigned int out_pitch,
|
| - unsigned int output_height,
|
| - int16_t *filter) {
|
| +void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
|
| + ptrdiff_t src_pitch,
|
| + uint8_t *output_ptr,
|
| + ptrdiff_t out_pitch,
|
| + uint32_t output_height,
|
| + const int16_t *filter) {
|
| __m128i addFilterReg64, filtersReg, minReg;
|
| __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
|
| __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
|
| @@ -321,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
| addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
|
| - filtersReg = _mm_loadu_si128((__m128i *)filter);
|
| + filtersReg = _mm_loadu_si128((const __m128i *)filter);
|
| // converting the 16 bit (short) to 8 bit (byte) and have the same data
|
| // in both lanes of 128 bit register.
|
| filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
|
| @@ -336,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
|
| forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
|
|
|
| // load the first 7 rows of 8 bytes
|
| - srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
|
| - srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
|
| - srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
|
| - srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
|
| - srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
|
| - srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
|
| - srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
|
| + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
|
| + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
|
| + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
|
| + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
|
| + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
|
| + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
|
| + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
|
|
|
| for (i = 0; i < output_height; i++) {
|
| // load the last 8 bytes
|
| - srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
|
| + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
|
|
|
| // merge the result together
|
| srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
|
| @@ -394,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
|
| }
|
| }
|
|
|
| -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
|
| - unsigned int src_pitch,
|
| - unsigned char *output_ptr,
|
| - unsigned int out_pitch,
|
| - unsigned int output_height,
|
| - int16_t *filter) {
|
| +static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
|
| + ptrdiff_t src_pitch,
|
| + uint8_t *output_ptr,
|
| + ptrdiff_t out_pitch,
|
| + uint32_t output_height,
|
| + const int16_t *filter) {
|
| __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
|
| __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
|
| __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
|
| @@ -409,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
|
|
|
| // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
|
| addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
|
| - filtersReg = _mm_loadu_si128((__m128i *)filter);
|
| + filtersReg = _mm_loadu_si128((const __m128i *)filter);
|
| // converting the 16 bit (short) to 8 bit (byte) and have the same data
|
| // in both lanes of 128 bit register.
|
| filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
|
| @@ -424,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
|
| forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
|
|
|
| // load the first 7 rows of 16 bytes
|
| - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
|
| - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
|
| - srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
|
| - srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
|
| - srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
|
| - srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
|
| - srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
|
| + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
|
| + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
|
| + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
|
| + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
|
| + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
|
| + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
|
| + srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
|
|
|
| for (i = 0; i < output_height; i++) {
|
| // load the last 16 bytes
|
| - srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
|
| + srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
|
|
|
| // merge the result together
|
| srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
|
| @@ -508,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
|
| output_ptr+=out_pitch;
|
| }
|
| }
|
| +
|
| +#if ARCH_X86_64
|
| +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
|
| +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
|
| +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
|
| +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
|
| +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
|
| +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
|
| +#else // ARCH_X86
|
| +filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
|
| +#endif // ARCH_X86_64
|
| +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
|
| +
|
| +filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
|
| +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
|
| +
|
| +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
|
| +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
|
| +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
|
| +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
|
| + ssse3);
|
| +
|
| +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
| +// uint8_t *dst, ptrdiff_t dst_stride,
|
| +// const int16_t *filter_x, int x_step_q4,
|
| +// const int16_t *filter_y, int y_step_q4,
|
| +// int w, int h);
|
| +FUN_CONV_2D(, ssse3);
|
| +FUN_CONV_2D(avg_ , ssse3);
|
|
|