Index: source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |
diff --git a/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |
index 48817581d214411c3735c6caa51319da65edd57a..8d5c7c2dd108641eef415d817faee0182e0e713b 100644 |
--- a/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |
+++ b/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |
@@ -203,125 +203,6 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, |
} |
} |
-#if ARCH_X86_64 |
-static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
- ptrdiff_t src_pixels_per_line, |
- uint8_t *output_ptr, |
- ptrdiff_t output_pitch, |
- uint32_t output_height, |
- const int16_t *filter) { |
- __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
- __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
- unsigned int i; |
- |
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
- filtersReg = _mm_loadu_si128((const __m128i *)filter); |
- // converting the 16 bit (short) to 8 bit (byte) and have the same data |
- // in both lanes of 128 bit register. |
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
- |
- // duplicate only the first 16 bits (first and second byte) |
- // across 128 bit register |
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
- // duplicate only the second 16 bits (third and forth byte) |
- // across 128 bit register |
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
- // duplicate only the third 16 bits (fifth and sixth byte) |
- // across 128 bit register |
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
- // duplicate only the forth 16 bits (seventh and eighth byte) |
- // across 128 bit register |
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
- |
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
- |
- for (i = 0; i < output_height; i++) { |
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
- |
- // filter the source buffer |
- srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
- |
- // multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
- |
- // add and saturate the results together |
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
- |
- // filter the source buffer |
- srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
- |
- // multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
- |
- // add and saturate the results together |
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
- _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
- |
- // reading the next 16 bytes. |
- // (part of it was being read by earlier read) |
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
- |
- // add and saturate the results together |
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
- _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
- |
- // filter the source buffer |
- srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
- |
- // multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); |
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
- |
- // add and saturate the results together |
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
- |
- // filter the source buffer |
- srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); |
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); |
- |
- // multiply 2 adjacent elements with the filter and add the result |
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
- |
- // add and saturate the results together |
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
- _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
- _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
- |
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); |
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); |
- |
- // shift by 7 bit each 16 bit |
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); |
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); |
- |
- // shrink to 8 bit each 16 bits, the first lane contain the first |
- // convolve result and the second lane contain the second convolve |
- // result |
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
- |
- src_ptr+=src_pixels_per_line; |
- |
- // save 16 bytes |
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
- |
- output_ptr+=output_pitch; |
- } |
-} |
-#endif // ARCH_X86_64 |
- |
void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
ptrdiff_t src_pitch, |
uint8_t *output_ptr, |
@@ -527,26 +408,12 @@ static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
} |
#endif // ARCH_X86_64 |
-#if ARCH_X86_64 |
-filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; |
-filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; |
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; |
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; |
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
-filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; |
-#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 |
-#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 |
-#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 |
-#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 |
-#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 |
-#else // ARCH_X86 |
filter8_1dfunction vpx_filter_block1d16_v8_ssse3; |
filter8_1dfunction vpx_filter_block1d16_h8_ssse3; |
filter8_1dfunction vpx_filter_block1d8_v8_ssse3; |
filter8_1dfunction vpx_filter_block1d8_h8_ssse3; |
filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
filter8_1dfunction vpx_filter_block1d4_h8_ssse3; |
-#endif // ARCH_X86_64 |
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; |
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; |
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; |