Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(223)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 // Due to a header conflict between math.h and intrinsics includes with ceil()
12 // in certain configurations under vs9 this include needs to precede
13 // tmmintrin.h.
14 #include "./vp9_rtcd.h"
15
11 #include <tmmintrin.h> 16 #include <tmmintrin.h>
17
18 #include "vp9/common/x86/convolve.h"
12 #include "vpx_ports/mem.h" 19 #include "vpx_ports/mem.h"
13 #include "vpx_ports/emmintrin_compat.h" 20 #include "vpx_ports/emmintrin_compat.h"
14 21
15 // filters only for the 4_h8 convolution 22 // filters only for the 4_h8 convolution
16 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
17 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
18 }; 25 };
19 26
20 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
21 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
22 }; 29 };
23 30
24 // filters for 8_h8 and 16_h8 31 // filters for 8_h8 and 16_h8
25 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { 32 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
26 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 33 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
27 }; 34 };
28 35
29 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { 36 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
30 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 37 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
31 }; 38 };
32 39
33 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { 40 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
34 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 41 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
35 }; 42 };
36 43
37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 44 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 45 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
39 }; 46 };
40 47
41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, 48 // These are reused by the avx2 intrinsics.
42 unsigned int src_pixels_per_line, 49 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
43 unsigned char *output_ptr, 50 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
44 unsigned int output_pitch, 51 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
45 unsigned int output_height, 52
46 int16_t *filter) { 53 void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
54 ptrdiff_t src_pixels_per_line,
55 uint8_t *output_ptr,
56 ptrdiff_t output_pitch,
57 uint32_t output_height,
58 const int16_t *filter) {
47 __m128i firstFilters, secondFilters, shuffle1, shuffle2; 59 __m128i firstFilters, secondFilters, shuffle1, shuffle2;
48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 60 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
49 __m128i addFilterReg64, filtersReg, srcReg, minReg; 61 __m128i addFilterReg64, filtersReg, srcReg, minReg;
50 unsigned int i; 62 unsigned int i;
51 63
52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 64 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); 65 addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
54 filtersReg = _mm_loadu_si128((__m128i *)filter); 66 filtersReg = _mm_loadu_si128((const __m128i *)filter);
55 // converting the 16 bit (short) to 8 bit (byte) and have the same data 67 // converting the 16 bit (short) to 8 bit (byte) and have the same data
56 // in both lanes of 128 bit register. 68 // in both lanes of 128 bit register.
57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 69 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
58 70
59 // duplicate only the first 16 bits in the filter into the first lane 71 // duplicate only the first 16 bits in the filter into the first lane
60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); 72 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
61 // duplicate only the third 16 bits in the filter into the first lane 73 // duplicate only the third 16 bits in the filter into the first lane
62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); 74 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
63 // duplicate only the second 16 bits in the filter into the second lane 75 // duplicate only the second 16 bits in the filter into the second lane
64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 76 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); 77 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
66 // duplicate only the fourth 16 bits in the filter into the second lane 78 // duplicate only the fourth 16 bits in the filter into the second lane
67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 79 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); 80 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
69 81
70 // loading the local filters 82 // loading the local filters
71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); 83 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); 84 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
73 85
74 for (i = 0; i < output_height; i++) { 86 for (i = 0; i < output_height; i++) {
75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 87 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
76 88
77 // filter the source buffer 89 // filter the source buffer
78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); 90 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); 91 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
80 92
81 // multiply 2 adjacent elements with the filter and add the result 93 // multiply 2 adjacent elements with the filter and add the result
82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 94 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 95 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
84 96
85 // extract the higher half of the lane 97 // extract the higher half of the lane
(...skipping 16 matching lines...) Expand all
102 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 114 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
103 src_ptr+=src_pixels_per_line; 115 src_ptr+=src_pixels_per_line;
104 116
105 // save only 4 bytes 117 // save only 4 bytes
106 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); 118 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
107 119
108 output_ptr+=output_pitch; 120 output_ptr+=output_pitch;
109 } 121 }
110 } 122 }
111 123
112 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, 124 void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
113 unsigned int src_pixels_per_line, 125 ptrdiff_t src_pixels_per_line,
114 unsigned char *output_ptr, 126 uint8_t *output_ptr,
115 unsigned int output_pitch, 127 ptrdiff_t output_pitch,
116 unsigned int output_height, 128 uint32_t output_height,
117 int16_t *filter) { 129 const int16_t *filter) {
118 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; 130 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
119 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 131 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
120 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 132 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
121 __m128i addFilterReg64, filtersReg, minReg; 133 __m128i addFilterReg64, filtersReg, minReg;
122 unsigned int i; 134 unsigned int i;
123 135
124 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 136 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
125 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 137 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
126 filtersReg = _mm_loadu_si128((__m128i *)filter); 138 filtersReg = _mm_loadu_si128((const __m128i *)filter);
127 // converting the 16 bit (short) to 8 bit (byte) and have the same data 139 // converting the 16 bit (short) to 8 bit (byte) and have the same data
128 // in both lanes of 128 bit register. 140 // in both lanes of 128 bit register.
129 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 141 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
130 142
131 // duplicate only the first 16 bits (first and second byte) 143 // duplicate only the first 16 bits (first and second byte)
132 // across 128 bit register 144 // across 128 bit register
133 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 145 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
134 // duplicate only the second 16 bits (third and fourth byte) 146 // duplicate only the second 16 bits (third and fourth byte)
135 // across 128 bit register 147 // across 128 bit register
136 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 148 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
137 // duplicate only the third 16 bits (fifth and sixth byte) 149 // duplicate only the third 16 bits (fifth and sixth byte)
138 // across 128 bit register 150 // across 128 bit register
139 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 151 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
140 // duplicate only the fourth 16 bits (seventh and eighth byte) 152 // duplicate only the fourth 16 bits (seventh and eighth byte)
141 // across 128 bit register 153 // across 128 bit register
142 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 154 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
143 155
144 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 156 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
145 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 157 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
146 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 158 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
147 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 159 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
148 160
149 for (i = 0; i < output_height; i++) { 161 for (i = 0; i < output_height; i++) {
150 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); 162 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
151 163
152 // filter the source buffer 164 // filter the source buffer
153 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); 165 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
154 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); 166 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
155 167
156 // multiply 2 adjacent elements with the filter and add the result 168 // multiply 2 adjacent elements with the filter and add the result
157 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 169 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
158 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 170 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
159 171
160 // filter the source buffer 172 // filter the source buffer
(...skipping 21 matching lines...) Expand all
182 194
183 src_ptr+=src_pixels_per_line; 195 src_ptr+=src_pixels_per_line;
184 196
185 // save only 8 bytes 197 // save only 8 bytes
186 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
187 199
188 output_ptr+=output_pitch; 200 output_ptr+=output_pitch;
189 } 201 }
190 } 202 }
191 203
192 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, 204 static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
193 unsigned int src_pixels_per_line, 205 ptrdiff_t src_pixels_per_line,
194 unsigned char *output_ptr, 206 uint8_t *output_ptr,
195 unsigned int output_pitch, 207 ptrdiff_t output_pitch,
196 unsigned int output_height, 208 uint32_t output_height,
197 int16_t *filter) { 209 const int16_t *filter) {
198 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; 210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
199 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
200 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
201 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; 213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
202 unsigned int i; 214 unsigned int i;
203 215
204 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 216 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
205 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 217 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
206 filtersReg = _mm_loadu_si128((__m128i *)filter); 218 filtersReg = _mm_loadu_si128((const __m128i *)filter);
207 // converting the 16 bit (short) to 8 bit (byte) and have the same data 219 // converting the 16 bit (short) to 8 bit (byte) and have the same data
208 // in both lanes of 128 bit register. 220 // in both lanes of 128 bit register.
209 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 221 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
210 222
211 // duplicate only the first 16 bits (first and second byte) 223 // duplicate only the first 16 bits (first and second byte)
212 // across 128 bit register 224 // across 128 bit register
213 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 225 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
214 // duplicate only the second 16 bits (third and fourth byte) 226 // duplicate only the second 16 bits (third and fourth byte)
215 // across 128 bit register 227 // across 128 bit register
216 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 228 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
217 // duplicate only the third 16 bits (fifth and sixth byte) 229 // duplicate only the third 16 bits (fifth and sixth byte)
218 // across 128 bit register 230 // across 128 bit register
219 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 231 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
220 // duplicate only the fourth 16 bits (seventh and eighth byte) 232 // duplicate only the fourth 16 bits (seventh and eighth byte)
221 // across 128 bit register 233 // across 128 bit register
222 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 234 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
223 235
224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 236 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 237 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 238 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 239 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
228 240
229 for (i = 0; i < output_height; i++) { 241 for (i = 0; i < output_height; i++) {
230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); 242 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
231 243
232 // filter the source buffer 244 // filter the source buffer
233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); 245 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); 246 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
235 247
236 // multiply 2 adjacent elements with the filter and add the result 248 // multiply 2 adjacent elements with the filter and add the result
237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); 249 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); 250 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
239 251
240 // add and saturate the results together 252 // add and saturate the results together
241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 253 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
242 254
243 // filter the source buffer 255 // filter the source buffer
244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); 256 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); 257 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
246 258
247 // multiply 2 adjacent elements with the filter and add the result 259 // multiply 2 adjacent elements with the filter and add the result
248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 260 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 261 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
250 262
251 // add and saturate the results together 263 // add and saturate the results together
252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 264 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
253 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 265 _mm_min_epi16(srcRegFilt3, srcRegFilt2));
254 266
255 // reading the next 16 bytes. 267 // reading the next 16 bytes.
256 // (part of it was being read by earlier read) 268 // (part of it was being read by earlier read)
257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); 269 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
258 270
259 // add and saturate the results together 271 // add and saturate the results together
260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 272 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
261 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 273 _mm_max_epi16(srcRegFilt3, srcRegFilt2));
262 274
263 // filter the source buffer 275 // filter the source buffer
264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); 276 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); 277 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
266 278
267 // multiply 2 adjacent elements with the filter and add the result 279 // multiply 2 adjacent elements with the filter and add the result
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
299 311
300 src_ptr+=src_pixels_per_line; 312 src_ptr+=src_pixels_per_line;
301 313
302 // save 16 bytes 314 // save 16 bytes
303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); 315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
304 316
305 output_ptr+=output_pitch; 317 output_ptr+=output_pitch;
306 } 318 }
307 } 319 }
308 320
309 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, 321 void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
310 unsigned int src_pitch, 322 ptrdiff_t src_pitch,
311 unsigned char *output_ptr, 323 uint8_t *output_ptr,
312 unsigned int out_pitch, 324 ptrdiff_t out_pitch,
313 unsigned int output_height, 325 uint32_t output_height,
314 int16_t *filter) { 326 const int16_t *filter) {
315 __m128i addFilterReg64, filtersReg, minReg; 327 __m128i addFilterReg64, filtersReg, minReg;
316 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
317 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; 329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
318 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; 330 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
319 __m128i srcReg8; 331 __m128i srcReg8;
320 unsigned int i; 332 unsigned int i;
321 333
322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 334 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
323 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 335 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
324 filtersReg = _mm_loadu_si128((__m128i *)filter); 336 filtersReg = _mm_loadu_si128((const __m128i *)filter);
325 // converting the 16 bit (short) to 8 bit (byte) and have the same data 337 // converting the 16 bit (short) to 8 bit (byte) and have the same data
326 // in both lanes of 128 bit register. 338 // in both lanes of 128 bit register.
327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 339 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
328 340
329 // duplicate only the first 16 bits in the filter 341 // duplicate only the first 16 bits in the filter
330 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 342 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
331 // duplicate only the second 16 bits in the filter 343 // duplicate only the second 16 bits in the filter
332 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 344 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
333 // duplicate only the third 16 bits in the filter 345 // duplicate only the third 16 bits in the filter
334 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 346 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
335 // duplicate only the fourth 16 bits in the filter 347 // duplicate only the fourth 16 bits in the filter
336 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 348 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
337 349
338 // load the first 7 rows of 8 bytes 350 // load the first 7 rows of 8 bytes
339 srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); 351 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
340 srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); 352 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
341 srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); 353 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
342 srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); 354 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
343 srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); 355 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
344 srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); 356 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
345 srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); 357 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
346 358
347 for (i = 0; i < output_height; i++) { 359 for (i = 0; i < output_height; i++) {
348 // load the last 8 bytes 360 // load the last 8 bytes
349 srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); 361 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
350 362
351 // merge the result together 363 // merge the result together
352 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); 364 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
353 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); 365 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
354 366
355 // merge the result together 367 // merge the result together
356 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); 368 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
357 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); 369 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
358 370
359 // multiply 2 adjacent elements with the filter and add the result 371 // multiply 2 adjacent elements with the filter and add the result
(...skipping 27 matching lines...) Expand all
387 srcReg6 = srcReg7; 399 srcReg6 = srcReg7;
388 srcReg7 = srcReg8; 400 srcReg7 = srcReg8;
389 401
390 // save only 8 bytes convolve result 402 // save only 8 bytes convolve result
391 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
392 404
393 output_ptr+=out_pitch; 405 output_ptr+=out_pitch;
394 } 406 }
395 } 407 }
396 408
397 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, 409 static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
398 unsigned int src_pitch, 410 ptrdiff_t src_pitch,
399 unsigned char *output_ptr, 411 uint8_t *output_ptr,
400 unsigned int out_pitch, 412 ptrdiff_t out_pitch,
401 unsigned int output_height, 413 uint32_t output_height,
402 int16_t *filter) { 414 const int16_t *filter) {
403 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; 415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
404 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
405 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; 417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
406 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; 418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
407 __m128i srcReg8; 419 __m128i srcReg8;
408 unsigned int i; 420 unsigned int i;
409 421
410 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 422 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
411 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 423 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
412 filtersReg = _mm_loadu_si128((__m128i *)filter); 424 filtersReg = _mm_loadu_si128((const __m128i *)filter);
413 // converting the 16 bit (short) to 8 bit (byte) and have the same data 425 // converting the 16 bit (short) to 8 bit (byte) and have the same data
414 // in both lanes of 128 bit register. 426 // in both lanes of 128 bit register.
415 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 427 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
416 428
417 // duplicate only the first 16 bits in the filter 429 // duplicate only the first 16 bits in the filter
418 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 430 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
419 // duplicate only the second 16 bits in the filter 431 // duplicate only the second 16 bits in the filter
420 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 432 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
421 // duplicate only the third 16 bits in the filter 433 // duplicate only the third 16 bits in the filter
422 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 434 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
423 // duplicate only the fourth 16 bits in the filter 435 // duplicate only the fourth 16 bits in the filter
424 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 436 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
425 437
426 // load the first 7 rows of 16 bytes 438 // load the first 7 rows of 16 bytes
427 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); 439 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
428 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); 440 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
429 srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); 441 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
430 srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); 442 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
431 srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); 443 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
432 srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); 444 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
433 srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); 445 srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
434 446
435 for (i = 0; i < output_height; i++) { 447 for (i = 0; i < output_height; i++) {
436 // load the last 16 bytes 448 // load the last 16 bytes
437 srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); 449 srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
438 450
439 // merge the result together 451 // merge the result together
440 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); 452 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
441 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); 453 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
442 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); 454 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
443 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); 455 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
444 456
445 // multiply 2 adjacent elements with the filter and add the result 457 // multiply 2 adjacent elements with the filter and add the result
446 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); 458 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
447 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); 459 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
501 srcReg5 = srcReg6; 513 srcReg5 = srcReg6;
502 srcReg6 = srcReg7; 514 srcReg6 = srcReg7;
503 srcReg7 = srcReg8; 515 srcReg7 = srcReg8;
504 516
505 // save 16 bytes convolve result 517 // save 16 bytes convolve result
506 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
507 519
508 output_ptr+=out_pitch; 520 output_ptr+=out_pitch;
509 } 521 }
510 } 522 }
523
524 #if ARCH_X86_64
525 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
526 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
527 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
528 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
529 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
530 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
531 #define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
532 #define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
533 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
534 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
535 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
536 #else // ARCH_X86
537 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
538 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
539 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
540 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
541 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
542 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
543 #endif // ARCH_X86_64
544 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
545 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
546 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
547 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
548 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
549 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
550
551 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
552 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
553 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
554 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
555 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
556 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
557 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
558 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
559 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
560 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
561 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
562 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
563
564 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
565 // uint8_t *dst, ptrdiff_t dst_stride,
566 // const int16_t *filter_x, int x_step_q4,
567 // const int16_t *filter_y, int y_step_q4,
568 // int w, int h);
569 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
570 // uint8_t *dst, ptrdiff_t dst_stride,
571 // const int16_t *filter_x, int x_step_q4,
572 // const int16_t *filter_y, int y_step_q4,
573 // int w, int h);
574 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
575 // uint8_t *dst, ptrdiff_t dst_stride,
576 // const int16_t *filter_x, int x_step_q4,
577 // const int16_t *filter_y, int y_step_q4,
578 // int w, int h);
579 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
580 // uint8_t *dst, ptrdiff_t dst_stride,
581 // const int16_t *filter_x, int x_step_q4,
582 // const int16_t *filter_y, int y_step_q4,
583 // int w, int h);
584 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
585 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
586 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
587 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
588 ssse3);
589
590 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
591 // uint8_t *dst, ptrdiff_t dst_stride,
592 // const int16_t *filter_x, int x_step_q4,
593 // const int16_t *filter_y, int y_step_q4,
594 // int w, int h);
595 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
596 // uint8_t *dst, ptrdiff_t dst_stride,
597 // const int16_t *filter_x, int x_step_q4,
598 // const int16_t *filter_y, int y_step_q4,
599 // int w, int h);
600 FUN_CONV_2D(, ssse3);
601 FUN_CONV_2D(avg_ , ssse3);
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698