OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 | 12 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 #include "./vp9_rtcd.h" | 14 #include "./vp9_rtcd.h" |
15 #include "vpx_ports/mem.h" | 15 #include "vpx_ports/mem.h" |
16 | 16 |
/*
 * Common signature of the 1-D filter kernels declared in this file
 * (vp9_filter_block1d{4,8,16}_{h,v}{8,2}[_avg]_<opt>).  The kernels are
 * only declared here; their definitions live outside this file.
 *
 *   src_ptr        source pointer handed over by the convolve wrapper
 *   src_pitch      source stride; the wrappers pass their ptrdiff_t
 *                  src_stride straight through
 *   output_ptr     destination pointer
 *   out_pitch      destination stride (the wrappers' ptrdiff_t dst_stride)
 *   output_height  number of rows; the wrappers pass their `h`
 *   filter         tap array; the wrappers pass filter_x or filter_y
 */
typedef void filter8_1dfunction (
  const unsigned char *src_ptr,
  const ptrdiff_t src_pitch,
  unsigned char *output_ptr,
  ptrdiff_t out_pitch,
  unsigned int output_height,
  const short *filter
);
25 | 25 |
/*
 * FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<name>_<opt>().
 *
 *   name       suffix of the generated function and of its C fallback
 *              vp9_convolve8_<name>_c
 *   step_q4    step parameter to test (x_step_q4 or y_step_q4)
 *   filter     tap array to test and forward (filter_x or filter_y)
 *   dir        direction letter pasted into the kernel name (h or v)
 *   src_start  source-pointer expression given to the "8" kernels; it is
 *              re-evaluated on every call, so it tracks `src` as the loops
 *              advance
 *   avg        empty, or `avg_` to select the *_avg_* kernels
 *   opt        instruction-set suffix of the kernels
 *
 * Dispatch performed by the generated function:
 *   - When step_q4 == 16 and filter[3] != 128, the block is consumed in
 *     column strips of 16, then 8, then 4 pixels.  If any of filter[0..2]
 *     is non-zero the vp9_filter_block1d<N>_<dir>8_ kernels are called with
 *     src_start; otherwise the vp9_filter_block1d<N>_<dir>2_ kernels are
 *     called with plain `src`.
 *   - Whatever width is left (all of it when the test above fails) goes to
 *     vp9_convolve8_<name>_c with the original arguments.
 *
 * NOTE(review): 16 presumably means "no scaling" and a 128 centre tap
 * presumably marks the pass-through filter -- confirm against the filter
 * tables.
 */
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                  uint8_t *dst, ptrdiff_t dst_stride, \
                                  const int16_t *filter_x, int x_step_q4, \
                                  const int16_t *filter_y, int y_step_q4, \
                                  int w, int h) { \
  if (step_q4 == 16 && filter[3] != 128) { \
    if (filter[0] || filter[1] || filter[2]) { \
      while (w >= 16) { \
        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
                                                 dst, dst_stride, \
                                                 h, filter); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } else { \
      while (w >= 16) { \
        vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
                                                 dst, dst_stride, \
                                                 h, filter); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } \
  } \
  if (w) { \
    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, \
                             w, h); \
  } \
}
| 109 |
/*
 * FUN_CONV_2D(avg, opt)
 *
 * Expands to the definition of vp9_convolve8_<avg><opt>(), a two-pass
 * convolution built from vp9_convolve8_horiz_<opt> and
 * vp9_convolve8_<avg>vert_<opt>.
 *
 *   avg  empty, or `avg_` to use the averaging vertical pass and the
 *        vp9_convolve8_avg_c fallback
 *   opt  instruction-set suffix
 *
 * The generated function asserts w <= 64 and h <= 64, then:
 *   - If x_step_q4 and y_step_q4 are both 16:
 *       * when either filter has a non-zero tap among [0..2] or a tap [3]
 *         equal to 128, the horizontal pass starts 3 rows above `src` and
 *         writes h + 7 rows into a 64-byte-stride scratch buffer of
 *         64 * 71 bytes; the vertical pass then reads from row 3 of that
 *         buffer;
 *       * otherwise the horizontal pass starts at `src`, writes h + 1 rows
 *         into a 64 * 65 byte scratch buffer, and the vertical pass reads
 *         from its first row.
 *   - For any other step values the call is forwarded unchanged to
 *     vp9_convolve8_<avg>c.
 */
#define FUN_CONV_2D(avg, opt) \
void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                              uint8_t *dst, ptrdiff_t dst_stride, \
                              const int16_t *filter_x, int x_step_q4, \
                              const int16_t *filter_y, int y_step_q4, \
                              int w, int h) { \
  assert(w <= 64); \
  assert(h <= 64); \
  if (x_step_q4 == 16 && y_step_q4 == 16) { \
    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 7); \
      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } else { \
      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
      vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
                                filter_x, x_step_q4, filter_y, y_step_q4, \
                                w, h + 1); \
      vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
                                      filter_x, x_step_q4, filter_y, \
                                      y_step_q4, w, h); \
    } \
  } else { \
    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
  } \
}
#if HAVE_AVX2
/*
 * Only the 16-wide 8-tap kernels have AVX2 declarations here; every other
 * *_avx2 kernel name is aliased to its SSSE3 counterpart so that the
 * FUN_CONV_* expansions below resolve.
 *
 * NOTE(review): this section references SSSE3 symbols while being guarded
 * by HAVE_AVX2 alone -- presumably SSSE3 is always built when AVX2 is;
 * confirm against the build configuration.
 */
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
// The instantiations below define:
// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
// No trailing ';' after an instantiation: each one already expands to a
// complete function definition.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2)
#endif
#if HAVE_SSSE3
/* 8-tap SSSE3 kernels (plain and averaging); defined outside this file. */
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

/* 2-tap SSSE3 kernels (plain and averaging); defined outside this file. */
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;

// The instantiations below define:
// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
// No trailing ';' after an instantiation: each one already expands to a
// complete function definition.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3)
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3)

// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3)
FUN_CONV_2D(avg_ , ssse3)
#endif
236 | 251 |
#if HAVE_SSE2
/* 8-tap SSE2 kernels (plain and averaging); defined outside this file. */
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;

/* 2-tap SSE2 kernels (plain and averaging); defined outside this file. */
filter8_1dfunction vp9_filter_block1d16_v2_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_sse2;
filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;

// The instantiations below define:
// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                                  uint8_t *dst, ptrdiff_t dst_stride,
//                                  const int16_t *filter_x, int x_step_q4,
//                                  const int16_t *filter_y, int y_step_q4,
//                                  int w, int h);
// No trailing ';' after an instantiation: each one already expands to a
// complete function definition.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2)
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2)

// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
//                             uint8_t *dst, ptrdiff_t dst_stride,
//                             const int16_t *filter_x, int x_step_q4,
//                             const int16_t *filter_y, int y_step_q4,
//                             int w, int h);
FUN_CONV_2D(, sse2)
FUN_CONV_2D(avg_ , sse2)
#endif
OLD | NEW |