OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
| 12 // in certain configurations under vs9 this include needs to precede |
| 13 // tmmintrin.h. |
| 14 #include "./vp9_rtcd.h" |
| 15 |
11 #include <tmmintrin.h> | 16 #include <tmmintrin.h> |
| 17 |
| 18 #include "vp9/common/x86/convolve.h" |
12 #include "vpx_ports/mem.h" | 19 #include "vpx_ports/mem.h" |
13 #include "vpx_ports/emmintrin_compat.h" | 20 #include "vpx_ports/emmintrin_compat.h" |
14 | 21 |
15 // filters only for the 4_h8 convolution | 22 // filters only for the 4_h8 convolution |
16 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { | 23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
17 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | 24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
18 }; | 25 }; |
19 | 26 |
20 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { | 27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
21 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | 28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
22 }; | 29 }; |
23 | 30 |
24 // filters for 8_h8 and 16_h8 | 31 // filters for 8_h8 and 16_h8 |
25 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { | 32 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { |
26 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 33 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
27 }; | 34 }; |
28 | 35 |
29 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { | 36 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { |
30 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 37 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
31 }; | 38 }; |
32 | 39 |
33 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { | 40 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { |
34 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 41 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
35 }; | 42 }; |
36 | 43 |
37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { | 44 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 45 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
39 }; | 46 }; |
40 | 47 |
41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | 48 // These are reused by the avx2 intrinsics. |
42 unsigned int src_pixels_per_line, | 49 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
43 unsigned char *output_ptr, | 50 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
44 unsigned int output_pitch, | 51 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
45 unsigned int output_height, | 52 |
46 int16_t *filter) { | 53 void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, |
| 54 ptrdiff_t src_pixels_per_line, |
| 55 uint8_t *output_ptr, |
| 56 ptrdiff_t output_pitch, |
| 57 uint32_t output_height, |
| 58 const int16_t *filter) { |
47 __m128i firstFilters, secondFilters, shuffle1, shuffle2; | 59 __m128i firstFilters, secondFilters, shuffle1, shuffle2; |
48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 60 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
49 __m128i addFilterReg64, filtersReg, srcReg, minReg; | 61 __m128i addFilterReg64, filtersReg, srcReg, minReg; |
50 unsigned int i; | 62 unsigned int i; |
51 | 63 |
52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 64 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); | 65 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
54 filtersReg = _mm_loadu_si128((__m128i *)filter); | 66 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
55 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 67 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
56 // in both lanes of 128 bit register. | 68 // in both lanes of 128 bit register. |
57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 69 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
58 | 70 |
59 // duplicate only the first 16 bits in the filter into the first lane | 71 // duplicate only the first 16 bits in the filter into the first lane |
60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | 72 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); |
61 // duplicate only the third 16 bit in the filter into the first lane | 73 // duplicate only the third 16 bit in the filter into the first lane |
62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | 74 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); |
63 // duplicate only the seconds 16 bits in the filter into the second lane | 75 // duplicate only the seconds 16 bits in the filter into the second lane |
64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 | 76 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 |
65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | 77 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); |
66 // duplicate only the forth 16 bits in the filter into the second lane | 78 // duplicate only the forth 16 bits in the filter into the second lane |
67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 | 79 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 |
68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | 80 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); |
69 | 81 |
70 // loading the local filters | 82 // loading the local filters |
71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); | 83 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); |
72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); | 84 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); |
73 | 85 |
74 for (i = 0; i < output_height; i++) { | 86 for (i = 0; i < output_height; i++) { |
75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 87 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
76 | 88 |
77 // filter the source buffer | 89 // filter the source buffer |
78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); | 90 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); |
79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); | 91 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); |
80 | 92 |
81 // multiply 2 adjacent elements with the filter and add the result | 93 // multiply 2 adjacent elements with the filter and add the result |
82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 94 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 95 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
84 | 96 |
85 // extract the higher half of the lane | 97 // extract the higher half of the lane |
(...skipping 16 matching lines...) Expand all Loading... |
102 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | 114 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
103 src_ptr+=src_pixels_per_line; | 115 src_ptr+=src_pixels_per_line; |
104 | 116 |
105 // save only 4 bytes | 117 // save only 4 bytes |
106 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | 118 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); |
107 | 119 |
108 output_ptr+=output_pitch; | 120 output_ptr+=output_pitch; |
109 } | 121 } |
110 } | 122 } |
111 | 123 |
112 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, | 124 void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, |
113 unsigned int src_pixels_per_line, | 125 ptrdiff_t src_pixels_per_line, |
114 unsigned char *output_ptr, | 126 uint8_t *output_ptr, |
115 unsigned int output_pitch, | 127 ptrdiff_t output_pitch, |
116 unsigned int output_height, | 128 uint32_t output_height, |
117 int16_t *filter) { | 129 const int16_t *filter) { |
118 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; | 130 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; |
119 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 131 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
120 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 132 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
121 __m128i addFilterReg64, filtersReg, minReg; | 133 __m128i addFilterReg64, filtersReg, minReg; |
122 unsigned int i; | 134 unsigned int i; |
123 | 135 |
124 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 136 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
125 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 137 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
126 filtersReg = _mm_loadu_si128((__m128i *)filter); | 138 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
127 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 139 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
128 // in both lanes of 128 bit register. | 140 // in both lanes of 128 bit register. |
129 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 141 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
130 | 142 |
131 // duplicate only the first 16 bits (first and second byte) | 143 // duplicate only the first 16 bits (first and second byte) |
132 // across 128 bit register | 144 // across 128 bit register |
133 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 145 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
134 // duplicate only the second 16 bits (third and forth byte) | 146 // duplicate only the second 16 bits (third and forth byte) |
135 // across 128 bit register | 147 // across 128 bit register |
136 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 148 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
137 // duplicate only the third 16 bits (fifth and sixth byte) | 149 // duplicate only the third 16 bits (fifth and sixth byte) |
138 // across 128 bit register | 150 // across 128 bit register |
139 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 151 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
140 // duplicate only the forth 16 bits (seventh and eighth byte) | 152 // duplicate only the forth 16 bits (seventh and eighth byte) |
141 // across 128 bit register | 153 // across 128 bit register |
142 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 154 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
143 | 155 |
144 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 156 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
145 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 157 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
146 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 158 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
147 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 159 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
148 | 160 |
149 for (i = 0; i < output_height; i++) { | 161 for (i = 0; i < output_height; i++) { |
150 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 162 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
151 | 163 |
152 // filter the source buffer | 164 // filter the source buffer |
153 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); | 165 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); |
154 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); | 166 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); |
155 | 167 |
156 // multiply 2 adjacent elements with the filter and add the result | 168 // multiply 2 adjacent elements with the filter and add the result |
157 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 169 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
158 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 170 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
159 | 171 |
160 // filter the source buffer | 172 // filter the source buffer |
(...skipping 21 matching lines...) Expand all Loading... |
182 | 194 |
183 src_ptr+=src_pixels_per_line; | 195 src_ptr+=src_pixels_per_line; |
184 | 196 |
185 // save only 8 bytes | 197 // save only 8 bytes |
186 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
187 | 199 |
188 output_ptr+=output_pitch; | 200 output_ptr+=output_pitch; |
189 } | 201 } |
190 } | 202 } |
191 | 203 |
192 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, | 204 static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
193 unsigned int src_pixels_per_line, | 205 ptrdiff_t src_pixels_per_line, |
194 unsigned char *output_ptr, | 206 uint8_t *output_ptr, |
195 unsigned int output_pitch, | 207 ptrdiff_t output_pitch, |
196 unsigned int output_height, | 208 uint32_t output_height, |
197 int16_t *filter) { | 209 const int16_t *filter) { |
198 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | 210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
199 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
200 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
201 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | 213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
202 unsigned int i; | 214 unsigned int i; |
203 | 215 |
204 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 216 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
205 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 217 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
206 filtersReg = _mm_loadu_si128((__m128i *)filter); | 218 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
207 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 219 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
208 // in both lanes of 128 bit register. | 220 // in both lanes of 128 bit register. |
209 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 221 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
210 | 222 |
211 // duplicate only the first 16 bits (first and second byte) | 223 // duplicate only the first 16 bits (first and second byte) |
212 // across 128 bit register | 224 // across 128 bit register |
213 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 225 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
214 // duplicate only the second 16 bits (third and forth byte) | 226 // duplicate only the second 16 bits (third and forth byte) |
215 // across 128 bit register | 227 // across 128 bit register |
216 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 228 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
217 // duplicate only the third 16 bits (fifth and sixth byte) | 229 // duplicate only the third 16 bits (fifth and sixth byte) |
218 // across 128 bit register | 230 // across 128 bit register |
219 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 231 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
220 // duplicate only the forth 16 bits (seventh and eighth byte) | 232 // duplicate only the forth 16 bits (seventh and eighth byte) |
221 // across 128 bit register | 233 // across 128 bit register |
222 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 234 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
223 | 235 |
224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 236 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 237 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 238 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 239 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
228 | 240 |
229 for (i = 0; i < output_height; i++) { | 241 for (i = 0; i < output_height; i++) { |
230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 242 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
231 | 243 |
232 // filter the source buffer | 244 // filter the source buffer |
233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); | 245 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); | 246 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
235 | 247 |
236 // multiply 2 adjacent elements with the filter and add the result | 248 // multiply 2 adjacent elements with the filter and add the result |
237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | 249 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); | 250 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
239 | 251 |
240 // add and saturate the results together | 252 // add and saturate the results together |
241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 253 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
242 | 254 |
243 // filter the source buffer | 255 // filter the source buffer |
244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); | 256 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); | 257 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
246 | 258 |
247 // multiply 2 adjacent elements with the filter and add the result | 259 // multiply 2 adjacent elements with the filter and add the result |
248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | 260 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | 261 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
250 | 262 |
251 // add and saturate the results together | 263 // add and saturate the results together |
252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 264 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
253 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 265 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
254 | 266 |
255 // reading the next 16 bytes. | 267 // reading the next 16 bytes. |
256 // (part of it was being read by earlier read) | 268 // (part of it was being read by earlier read) |
257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 269 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
258 | 270 |
259 // add and saturate the results together | 271 // add and saturate the results together |
260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 272 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
261 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 273 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
262 | 274 |
263 // filter the source buffer | 275 // filter the source buffer |
264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); | 276 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); | 277 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
266 | 278 |
267 // multiply 2 adjacent elements with the filter and add the result | 279 // multiply 2 adjacent elements with the filter and add the result |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
299 | 311 |
300 src_ptr+=src_pixels_per_line; | 312 src_ptr+=src_pixels_per_line; |
301 | 313 |
302 // save 16 bytes | 314 // save 16 bytes |
303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
304 | 316 |
305 output_ptr+=output_pitch; | 317 output_ptr+=output_pitch; |
306 } | 318 } |
307 } | 319 } |
308 | 320 |
309 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, | 321 void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
310 unsigned int src_pitch, | 322 ptrdiff_t src_pitch, |
311 unsigned char *output_ptr, | 323 uint8_t *output_ptr, |
312 unsigned int out_pitch, | 324 ptrdiff_t out_pitch, |
313 unsigned int output_height, | 325 uint32_t output_height, |
314 int16_t *filter) { | 326 const int16_t *filter) { |
315 __m128i addFilterReg64, filtersReg, minReg; | 327 __m128i addFilterReg64, filtersReg, minReg; |
316 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
317 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
318 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 330 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
319 __m128i srcReg8; | 331 __m128i srcReg8; |
320 unsigned int i; | 332 unsigned int i; |
321 | 333 |
322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 334 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
323 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 335 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
324 filtersReg = _mm_loadu_si128((__m128i *)filter); | 336 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
325 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 337 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
326 // in both lanes of 128 bit register. | 338 // in both lanes of 128 bit register. |
327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 339 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
328 | 340 |
329 // duplicate only the first 16 bits in the filter | 341 // duplicate only the first 16 bits in the filter |
330 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 342 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
331 // duplicate only the second 16 bits in the filter | 343 // duplicate only the second 16 bits in the filter |
332 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 344 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
333 // duplicate only the third 16 bits in the filter | 345 // duplicate only the third 16 bits in the filter |
334 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 346 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
335 // duplicate only the forth 16 bits in the filter | 347 // duplicate only the forth 16 bits in the filter |
336 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 348 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
337 | 349 |
338 // load the first 7 rows of 8 bytes | 350 // load the first 7 rows of 8 bytes |
339 srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); | 351 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); |
340 srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); | 352 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
341 srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); | 353 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
342 srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); | 354 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
343 srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); | 355 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
344 srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); | 356 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
345 srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); | 357 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
346 | 358 |
347 for (i = 0; i < output_height; i++) { | 359 for (i = 0; i < output_height; i++) { |
348 // load the last 8 bytes | 360 // load the last 8 bytes |
349 srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); | 361 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
350 | 362 |
351 // merge the result together | 363 // merge the result together |
352 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 364 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
353 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); | 365 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); |
354 | 366 |
355 // merge the result together | 367 // merge the result together |
356 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); | 368 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); |
357 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 369 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
358 | 370 |
359 // multiply 2 adjacent elements with the filter and add the result | 371 // multiply 2 adjacent elements with the filter and add the result |
(...skipping 27 matching lines...) Expand all Loading... |
387 srcReg6 = srcReg7; | 399 srcReg6 = srcReg7; |
388 srcReg7 = srcReg8; | 400 srcReg7 = srcReg8; |
389 | 401 |
390 // save only 8 bytes convolve result | 402 // save only 8 bytes convolve result |
391 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
392 | 404 |
393 output_ptr+=out_pitch; | 405 output_ptr+=out_pitch; |
394 } | 406 } |
395 } | 407 } |
396 | 408 |
397 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, | 409 static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
398 unsigned int src_pitch, | 410 ptrdiff_t src_pitch, |
399 unsigned char *output_ptr, | 411 uint8_t *output_ptr, |
400 unsigned int out_pitch, | 412 ptrdiff_t out_pitch, |
401 unsigned int output_height, | 413 uint32_t output_height, |
402 int16_t *filter) { | 414 const int16_t *filter) { |
403 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; | 415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
404 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
405 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; | 417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
406 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
407 __m128i srcReg8; | 419 __m128i srcReg8; |
408 unsigned int i; | 420 unsigned int i; |
409 | 421 |
410 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 422 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
411 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 423 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
412 filtersReg = _mm_loadu_si128((__m128i *)filter); | 424 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
413 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 425 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
414 // in both lanes of 128 bit register. | 426 // in both lanes of 128 bit register. |
415 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 427 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
416 | 428 |
417 // duplicate only the first 16 bits in the filter | 429 // duplicate only the first 16 bits in the filter |
418 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 430 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
419 // duplicate only the second 16 bits in the filter | 431 // duplicate only the second 16 bits in the filter |
420 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 432 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
421 // duplicate only the third 16 bits in the filter | 433 // duplicate only the third 16 bits in the filter |
422 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 434 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
423 // duplicate only the forth 16 bits in the filter | 435 // duplicate only the forth 16 bits in the filter |
424 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 436 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
425 | 437 |
426 // load the first 7 rows of 16 bytes | 438 // load the first 7 rows of 16 bytes |
427 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); | 439 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); |
428 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); | 440 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
429 srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); | 441 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
430 srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); | 442 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
431 srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); | 443 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
432 srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); | 444 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
433 srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); | 445 srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
434 | 446 |
435 for (i = 0; i < output_height; i++) { | 447 for (i = 0; i < output_height; i++) { |
436 // load the last 16 bytes | 448 // load the last 16 bytes |
437 srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); | 449 srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
438 | 450 |
439 // merge the result together | 451 // merge the result together |
440 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 452 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
441 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 453 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
442 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); | 454 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); |
443 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); | 455 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); |
444 | 456 |
445 // multiply 2 adjacent elements with the filter and add the result | 457 // multiply 2 adjacent elements with the filter and add the result |
446 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); | 458 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); |
447 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); | 459 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
501 srcReg5 = srcReg6; | 513 srcReg5 = srcReg6; |
502 srcReg6 = srcReg7; | 514 srcReg6 = srcReg7; |
503 srcReg7 = srcReg8; | 515 srcReg7 = srcReg8; |
504 | 516 |
505 // save 16 bytes convolve result | 517 // save 16 bytes convolve result |
506 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
507 | 519 |
508 output_ptr+=out_pitch; | 520 output_ptr+=out_pitch; |
509 } | 521 } |
510 } | 522 } |
| 523 |
// Forward declarations of the 1-D filter kernels used by the convolve
// wrappers below.  NOTE(review): the filter8_1dfunction type presumably
// comes from vp9/common/x86/convolve.h (included above) — confirm.
#if ARCH_X86_64
// On x86-64 the intrinsic implementations defined in this file are used:
// alias the generic kernel names onto the *_intrin_* symbols.
// NOTE(review): vp9_filter_block1d4_v8 has no *_intrin_* variant here, so it
// keeps the non-intrinsic (presumably assembly) implementation and gets no
// #define alias.
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
// On 32-bit x86 all 8-tap kernels use the non-intrinsic implementations.
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif  // ARCH_X86_64
// Averaging ("avg") variants of the 8-tap kernels.
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

// 2-tap (v2/h2) kernels and their averaging variants.
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
| 563 |
// The FUN_CONV_1D invocations below expand into the definitions of the four
// 1-D convolve entry points whose prototypes are spelled out in the comments;
// the macro builds the function bodies from the vp9_filter_block1d* kernels
// declared above via token pasting with the trailing 'ssse3' argument.
// NOTE(review): FUN_CONV_1D is presumably defined in
// vp9/common/x86/convolve.h (included above) — confirm.
// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
// NOTE(review): the vertical passes start reading 3 rows above 'src',
// apparently to supply the upper taps of the 8-tap filter — confirm against
// the FUN_CONV_1D macro definition.
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);
| 589 |
| 590 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 591 // uint8_t *dst, ptrdiff_t dst_stride, |
| 592 // const int16_t *filter_x, int x_step_q4, |
| 593 // const int16_t *filter_y, int y_step_q4, |
| 594 // int w, int h); |
| 595 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 596 // uint8_t *dst, ptrdiff_t dst_stride, |
| 597 // const int16_t *filter_x, int x_step_q4, |
| 598 // const int16_t *filter_y, int y_step_q4, |
| 599 // int w, int h); |
| 600 FUN_CONV_2D(, ssse3); |
| 601 FUN_CONV_2D(avg_ , ssse3); |
OLD | NEW |