| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
| 12 // in certain configurations under vs9 this include needs to precede |
| 13 // tmmintrin.h. |
| 14 #include "./vp9_rtcd.h" |
| 15 |
| 11 #include <tmmintrin.h> | 16 #include <tmmintrin.h> |
| 17 |
| 18 #include "vp9/common/x86/convolve.h" |
| 12 #include "vpx_ports/mem.h" | 19 #include "vpx_ports/mem.h" |
| 13 #include "vpx_ports/emmintrin_compat.h" | 20 #include "vpx_ports/emmintrin_compat.h" |
| 14 | 21 |
| 15 // filters only for the 4_h8 convolution | 22 // filters only for the 4_h8 convolution |
| 16 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { | 23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
| 17 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | 24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
| 18 }; | 25 }; |
| 19 | 26 |
| 20 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { | 27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
| 21 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | 28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
| 22 }; | 29 }; |
| 23 | 30 |
| 24 // filters for 8_h8 and 16_h8 | 31 // filters for 8_h8 and 16_h8 |
| 25 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { | 32 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { |
| 26 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 33 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
| 27 }; | 34 }; |
| 28 | 35 |
| 29 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { | 36 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { |
| 30 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 37 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| 31 }; | 38 }; |
| 32 | 39 |
| 33 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { | 40 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { |
| 34 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 41 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
| 35 }; | 42 }; |
| 36 | 43 |
| 37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { | 44 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
| 38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 45 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
| 39 }; | 46 }; |
| 40 | 47 |
| 41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | 48 // These are reused by the avx2 intrinsics. |
| 42 unsigned int src_pixels_per_line, | 49 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
| 43 unsigned char *output_ptr, | 50 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
| 44 unsigned int output_pitch, | 51 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
| 45 unsigned int output_height, | 52 |
| 46 int16_t *filter) { | 53 void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, |
| 54 ptrdiff_t src_pixels_per_line, |
| 55 uint8_t *output_ptr, |
| 56 ptrdiff_t output_pitch, |
| 57 uint32_t output_height, |
| 58 const int16_t *filter) { |
| 47 __m128i firstFilters, secondFilters, shuffle1, shuffle2; | 59 __m128i firstFilters, secondFilters, shuffle1, shuffle2; |
| 48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 60 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
| 49 __m128i addFilterReg64, filtersReg, srcReg, minReg; | 61 __m128i addFilterReg64, filtersReg, srcReg, minReg; |
| 50 unsigned int i; | 62 unsigned int i; |
| 51 | 63 |
| 52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 64 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); | 65 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
| 54 filtersReg = _mm_loadu_si128((__m128i *)filter); | 66 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 55 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 67 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 56 // in both lanes of 128 bit register. | 68 // in both lanes of 128 bit register. |
| 57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 69 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 58 | 70 |
| 59 // duplicate only the first 16 bits in the filter into the first lane | 71 // duplicate only the first 16 bits in the filter into the first lane |
| 60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | 72 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); |
| 61 // duplicate only the third 16 bits in the filter into the first lane | 73 // duplicate only the third 16 bits in the filter into the first lane |
| 62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | 74 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); |
| 63 // duplicate only the second 16 bits in the filter into the second lane | 75 // duplicate only the second 16 bits in the filter into the second lane |
| 64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 | 76 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 |
| 65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | 77 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); |
| 66 // duplicate only the fourth 16 bits in the filter into the second lane | 78 // duplicate only the fourth 16 bits in the filter into the second lane |
| 67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 | 79 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 |
| 68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | 80 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); |
| 69 | 81 |
| 70 // loading the local filters | 82 // loading the local filters |
| 71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); | 83 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); |
| 72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); | 84 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); |
| 73 | 85 |
| 74 for (i = 0; i < output_height; i++) { | 86 for (i = 0; i < output_height; i++) { |
| 75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 87 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
| 76 | 88 |
| 77 // filter the source buffer | 89 // filter the source buffer |
| 78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); | 90 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); |
| 79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); | 91 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); |
| 80 | 92 |
| 81 // multiply 2 adjacent elements with the filter and add the result | 93 // multiply 2 adjacent elements with the filter and add the result |
| 82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 94 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
| 83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 95 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 84 | 96 |
| 85 // extract the higher half of the lane | 97 // extract the higher half of the lane |
| (...skipping 16 matching lines...) Expand all Loading... |
| 102 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | 114 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
| 103 src_ptr+=src_pixels_per_line; | 115 src_ptr+=src_pixels_per_line; |
| 104 | 116 |
| 105 // save only 4 bytes | 117 // save only 4 bytes |
| 106 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | 118 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); |
| 107 | 119 |
| 108 output_ptr+=output_pitch; | 120 output_ptr+=output_pitch; |
| 109 } | 121 } |
| 110 } | 122 } |
| 111 | 123 |
| 112 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, | 124 void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, |
| 113 unsigned int src_pixels_per_line, | 125 ptrdiff_t src_pixels_per_line, |
| 114 unsigned char *output_ptr, | 126 uint8_t *output_ptr, |
| 115 unsigned int output_pitch, | 127 ptrdiff_t output_pitch, |
| 116 unsigned int output_height, | 128 uint32_t output_height, |
| 117 int16_t *filter) { | 129 const int16_t *filter) { |
| 118 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; | 130 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; |
| 119 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 131 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 120 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 132 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
| 121 __m128i addFilterReg64, filtersReg, minReg; | 133 __m128i addFilterReg64, filtersReg, minReg; |
| 122 unsigned int i; | 134 unsigned int i; |
| 123 | 135 |
| 124 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 136 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 125 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 137 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 126 filtersReg = _mm_loadu_si128((__m128i *)filter); | 138 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 127 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 139 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 128 // in both lanes of 128 bit register. | 140 // in both lanes of 128 bit register. |
| 129 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 141 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 130 | 142 |
| 131 // duplicate only the first 16 bits (first and second byte) | 143 // duplicate only the first 16 bits (first and second byte) |
| 132 // across 128 bit register | 144 // across 128 bit register |
| 133 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 145 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 134 // duplicate only the second 16 bits (third and fourth byte) | 146 // duplicate only the second 16 bits (third and fourth byte) |
| 135 // across 128 bit register | 147 // across 128 bit register |
| 136 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 148 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 137 // duplicate only the third 16 bits (fifth and sixth byte) | 149 // duplicate only the third 16 bits (fifth and sixth byte) |
| 138 // across 128 bit register | 150 // across 128 bit register |
| 139 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 151 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 140 // duplicate only the fourth 16 bits (seventh and eighth byte) | 152 // duplicate only the fourth 16 bits (seventh and eighth byte) |
| 141 // across 128 bit register | 153 // across 128 bit register |
| 142 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 154 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 143 | 155 |
| 144 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 156 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
| 145 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 157 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
| 146 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 158 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
| 147 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 159 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
| 148 | 160 |
| 149 for (i = 0; i < output_height; i++) { | 161 for (i = 0; i < output_height; i++) { |
| 150 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 162 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
| 151 | 163 |
| 152 // filter the source buffer | 164 // filter the source buffer |
| 153 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); | 165 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); |
| 154 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); | 166 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); |
| 155 | 167 |
| 156 // multiply 2 adjacent elements with the filter and add the result | 168 // multiply 2 adjacent elements with the filter and add the result |
| 157 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 169 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
| 158 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 170 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
| 159 | 171 |
| 160 // filter the source buffer | 172 // filter the source buffer |
| (...skipping 21 matching lines...) Expand all Loading... |
| 182 | 194 |
| 183 src_ptr+=src_pixels_per_line; | 195 src_ptr+=src_pixels_per_line; |
| 184 | 196 |
| 185 // save only 8 bytes | 197 // save only 8 bytes |
| 186 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
| 187 | 199 |
| 188 output_ptr+=output_pitch; | 200 output_ptr+=output_pitch; |
| 189 } | 201 } |
| 190 } | 202 } |
| 191 | 203 |
| 192 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, | 204 static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
| 193 unsigned int src_pixels_per_line, | 205 ptrdiff_t src_pixels_per_line, |
| 194 unsigned char *output_ptr, | 206 uint8_t *output_ptr, |
| 195 unsigned int output_pitch, | 207 ptrdiff_t output_pitch, |
| 196 unsigned int output_height, | 208 uint32_t output_height, |
| 197 int16_t *filter) { | 209 const int16_t *filter) { |
| 198 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | 210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
| 199 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
| 200 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 201 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | 213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
| 202 unsigned int i; | 214 unsigned int i; |
| 203 | 215 |
| 204 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 216 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 205 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 217 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 206 filtersReg = _mm_loadu_si128((__m128i *)filter); | 218 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 207 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 219 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 208 // in both lanes of 128 bit register. | 220 // in both lanes of 128 bit register. |
| 209 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 221 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 210 | 222 |
| 211 // duplicate only the first 16 bits (first and second byte) | 223 // duplicate only the first 16 bits (first and second byte) |
| 212 // across 128 bit register | 224 // across 128 bit register |
| 213 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 225 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 214 // duplicate only the second 16 bits (third and fourth byte) | 226 // duplicate only the second 16 bits (third and fourth byte) |
| 215 // across 128 bit register | 227 // across 128 bit register |
| 216 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 228 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 217 // duplicate only the third 16 bits (fifth and sixth byte) | 229 // duplicate only the third 16 bits (fifth and sixth byte) |
| 218 // across 128 bit register | 230 // across 128 bit register |
| 219 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 231 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 220 // duplicate only the fourth 16 bits (seventh and eighth byte) | 232 // duplicate only the fourth 16 bits (seventh and eighth byte) |
| 221 // across 128 bit register | 233 // across 128 bit register |
| 222 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 234 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 223 | 235 |
| 224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 236 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
| 225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 237 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
| 226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 238 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
| 227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 239 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
| 228 | 240 |
| 229 for (i = 0; i < output_height; i++) { | 241 for (i = 0; i < output_height; i++) { |
| 230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 242 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
| 231 | 243 |
| 232 // filter the source buffer | 244 // filter the source buffer |
| 233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); | 245 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
| 234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); | 246 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
| 235 | 247 |
| 236 // multiply 2 adjacent elements with the filter and add the result | 248 // multiply 2 adjacent elements with the filter and add the result |
| 237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | 249 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
| 238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); | 250 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
| 239 | 251 |
| 240 // add and saturate the results together | 252 // add and saturate the results together |
| 241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 253 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
| 242 | 254 |
| 243 // filter the source buffer | 255 // filter the source buffer |
| 244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); | 256 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
| 245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); | 257 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
| 246 | 258 |
| 247 // multiply 2 adjacent elements with the filter and add the result | 259 // multiply 2 adjacent elements with the filter and add the result |
| 248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | 260 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
| 249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | 261 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
| 250 | 262 |
| 251 // add and saturate the results together | 263 // add and saturate the results together |
| 252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 264 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 253 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 265 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 254 | 266 |
| 255 // reading the next 16 bytes. | 267 // reading the next 16 bytes. |
| 256 // (part of it was being read by earlier read) | 268 // (part of it was being read by earlier read) |
| 257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 269 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
| 258 | 270 |
| 259 // add and saturate the results together | 271 // add and saturate the results together |
| 260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 272 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 261 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 273 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 262 | 274 |
| 263 // filter the source buffer | 275 // filter the source buffer |
| 264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); | 276 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
| 265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); | 277 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
| 266 | 278 |
| 267 // multiply 2 adjacent elements with the filter and add the result | 279 // multiply 2 adjacent elements with the filter and add the result |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 299 | 311 |
| 300 src_ptr+=src_pixels_per_line; | 312 src_ptr+=src_pixels_per_line; |
| 301 | 313 |
| 302 // save 16 bytes | 314 // save 16 bytes |
| 303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
| 304 | 316 |
| 305 output_ptr+=output_pitch; | 317 output_ptr+=output_pitch; |
| 306 } | 318 } |
| 307 } | 319 } |
| 308 | 320 |
| 309 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, | 321 void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
| 310 unsigned int src_pitch, | 322 ptrdiff_t src_pitch, |
| 311 unsigned char *output_ptr, | 323 uint8_t *output_ptr, |
| 312 unsigned int out_pitch, | 324 ptrdiff_t out_pitch, |
| 313 unsigned int output_height, | 325 uint32_t output_height, |
| 314 int16_t *filter) { | 326 const int16_t *filter) { |
| 315 __m128i addFilterReg64, filtersReg, minReg; | 327 __m128i addFilterReg64, filtersReg, minReg; |
| 316 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 317 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
| 318 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 330 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
| 319 __m128i srcReg8; | 331 __m128i srcReg8; |
| 320 unsigned int i; | 332 unsigned int i; |
| 321 | 333 |
| 322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 334 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 323 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 335 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 324 filtersReg = _mm_loadu_si128((__m128i *)filter); | 336 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 325 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 337 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 326 // in both lanes of 128 bit register. | 338 // in both lanes of 128 bit register. |
| 327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 339 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 328 | 340 |
| 329 // duplicate only the first 16 bits in the filter | 341 // duplicate only the first 16 bits in the filter |
| 330 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 342 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 331 // duplicate only the second 16 bits in the filter | 343 // duplicate only the second 16 bits in the filter |
| 332 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 344 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 333 // duplicate only the third 16 bits in the filter | 345 // duplicate only the third 16 bits in the filter |
| 334 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 346 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 335 // duplicate only the fourth 16 bits in the filter | 347 // duplicate only the fourth 16 bits in the filter |
| 336 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 348 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 337 | 349 |
| 338 // load the first 7 rows of 8 bytes | 350 // load the first 7 rows of 8 bytes |
| 339 srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); | 351 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); |
| 340 srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); | 352 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
| 341 srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); | 353 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
| 342 srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); | 354 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
| 343 srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); | 355 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
| 344 srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); | 356 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
| 345 srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); | 357 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
| 346 | 358 |
| 347 for (i = 0; i < output_height; i++) { | 359 for (i = 0; i < output_height; i++) { |
| 348 // load the last 8 bytes | 360 // load the last 8 bytes |
| 349 srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); | 361 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
| 350 | 362 |
| 351 // merge the result together | 363 // merge the result together |
| 352 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 364 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
| 353 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); | 365 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); |
| 354 | 366 |
| 355 // merge the result together | 367 // merge the result together |
| 356 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); | 368 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); |
| 357 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 369 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
| 358 | 370 |
| 359 // multiply 2 adjacent elements with the filter and add the result | 371 // multiply 2 adjacent elements with the filter and add the result |
| (...skipping 27 matching lines...) Expand all Loading... |
| 387 srcReg6 = srcReg7; | 399 srcReg6 = srcReg7; |
| 388 srcReg7 = srcReg8; | 400 srcReg7 = srcReg8; |
| 389 | 401 |
| 390 // save only 8 bytes convolve result | 402 // save only 8 bytes convolve result |
| 391 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
| 392 | 404 |
| 393 output_ptr+=out_pitch; | 405 output_ptr+=out_pitch; |
| 394 } | 406 } |
| 395 } | 407 } |
| 396 | 408 |
| 397 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, | 409 static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
| 398 unsigned int src_pitch, | 410 ptrdiff_t src_pitch, |
| 399 unsigned char *output_ptr, | 411 uint8_t *output_ptr, |
| 400 unsigned int out_pitch, | 412 ptrdiff_t out_pitch, |
| 401 unsigned int output_height, | 413 uint32_t output_height, |
| 402 int16_t *filter) { | 414 const int16_t *filter) { |
| 403 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; | 415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
| 404 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
| 405 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; | 417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
| 406 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
| 407 __m128i srcReg8; | 419 __m128i srcReg8; |
| 408 unsigned int i; | 420 unsigned int i; |
| 409 | 421 |
| 410 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 422 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
| 411 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 423 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
| 412 filtersReg = _mm_loadu_si128((__m128i *)filter); | 424 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
| 413 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 425 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
| 414 // in both lanes of 128 bit register. | 426 // in both lanes of 128 bit register. |
| 415 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 427 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
| 416 | 428 |
| 417 // duplicate only the first 16 bits in the filter | 429 // duplicate only the first 16 bits in the filter |
| 418 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 430 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
| 419 // duplicate only the second 16 bits in the filter | 431 // duplicate only the second 16 bits in the filter |
| 420 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 432 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
| 421 // duplicate only the third 16 bits in the filter | 433 // duplicate only the third 16 bits in the filter |
| 422 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 434 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
| 423 // duplicate only the fourth 16 bits in the filter | 435 // duplicate only the fourth 16 bits in the filter |
| 424 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 436 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
| 425 | 437 |
| 426 // load the first 7 rows of 16 bytes | 438 // load the first 7 rows of 16 bytes |
| 427 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); | 439 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); |
| 428 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); | 440 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
| 429 srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); | 441 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
| 430 srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); | 442 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
| 431 srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); | 443 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
| 432 srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); | 444 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
| 433 srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); | 445 srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
| 434 | 446 |
| 435 for (i = 0; i < output_height; i++) { | 447 for (i = 0; i < output_height; i++) { |
| 436 // load the last 16 bytes | 448 // load the last 16 bytes |
| 437 srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); | 449 srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
| 438 | 450 |
| 439 // merge the result together | 451 // merge the result together |
| 440 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 452 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
| 441 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 453 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
| 442 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); | 454 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); |
| 443 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); | 455 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); |
| 444 | 456 |
| 445 // multiply 2 adjacent elements with the filter and add the result | 457 // multiply 2 adjacent elements with the filter and add the result |
| 446 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); | 458 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); |
| 447 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); | 459 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); |
| (...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 501 srcReg5 = srcReg6; | 513 srcReg5 = srcReg6; |
| 502 srcReg6 = srcReg7; | 514 srcReg6 = srcReg7; |
| 503 srcReg7 = srcReg8; | 515 srcReg7 = srcReg8; |
| 504 | 516 |
| 505 // save 16 bytes convolve result | 517 // save 16 bytes convolve result |
| 506 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
| 507 | 519 |
| 508 output_ptr+=out_pitch; | 520 output_ptr+=out_pitch; |
| 509 } | 521 } |
| 510 } | 522 } |
| 523 |
| 524 #if ARCH_X86_64 |
| 525 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; |
| 526 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; |
| 527 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
| 528 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
| 529 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 530 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
| 531 #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 |
| 532 #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 |
| 533 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 |
| 534 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 |
| 535 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 |
| 536 #else // ARCH_X86 |
| 537 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
| 538 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
| 539 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 540 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 541 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 542 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 543 #endif // ARCH_X86_64 |
| 544 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
| 545 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
| 546 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
| 547 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
| 548 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
| 549 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
| 550 |
| 551 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
| 552 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
| 553 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
| 554 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
| 555 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
| 556 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
| 557 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; |
| 558 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; |
| 559 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; |
| 560 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; |
| 561 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; |
| 562 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; |
| 563 |
| 564 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 565 // uint8_t *dst, ptrdiff_t dst_stride, |
| 566 // const int16_t *filter_x, int x_step_q4, |
| 567 // const int16_t *filter_y, int y_step_q4, |
| 568 // int w, int h); |
| 569 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 570 // uint8_t *dst, ptrdiff_t dst_stride, |
| 571 // const int16_t *filter_x, int x_step_q4, |
| 572 // const int16_t *filter_y, int y_step_q4, |
| 573 // int w, int h); |
| 574 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 575 // uint8_t *dst, ptrdiff_t dst_stride, |
| 576 // const int16_t *filter_x, int x_step_q4, |
| 577 // const int16_t *filter_y, int y_step_q4, |
| 578 // int w, int h); |
| 579 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 580 // uint8_t *dst, ptrdiff_t dst_stride, |
| 581 // const int16_t *filter_x, int x_step_q4, |
| 582 // const int16_t *filter_y, int y_step_q4, |
| 583 // int w, int h); |
| 584 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); |
| 585 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); |
| 586 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); |
| 587 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, |
| 588 ssse3); |
| 589 |
| 590 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 591 // uint8_t *dst, ptrdiff_t dst_stride, |
| 592 // const int16_t *filter_x, int x_step_q4, |
| 593 // const int16_t *filter_y, int y_step_q4, |
| 594 // int w, int h); |
| 595 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 596 // uint8_t *dst, ptrdiff_t dst_stride, |
| 597 // const int16_t *filter_x, int x_step_q4, |
| 598 // const int16_t *filter_y, int y_step_q4, |
| 599 // int w, int h); |
| 600 FUN_CONV_2D(, ssse3); |
| 601 FUN_CONV_2D(avg_ , ssse3); |
| OLD | NEW |