OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 #include "vpx_config.h" | 12 #include "vpx_config.h" |
13 #include "vp9/common/vp9_loopfilter.h" | 13 #include "vp9/common/vp9_loopfilter.h" |
| 14 #include "vpx_ports/emmintrin_compat.h" |
14 | 15 |
15 prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx); | 16 prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx); |
16 prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx); | 17 prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx); |
17 | 18 |
18 prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2); | 19 prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2); |
19 prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2); | 20 prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2); |
20 | 21 |
21 extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2; | 22 extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2; |
22 extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2; | 23 extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2; |
23 | 24 |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
77 | 78 |
/* Apply the simple vertical-edge filter to the three internal block
 * edges of a 16-wide macroblock (columns 4, 8 and 12). */
void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
                             const unsigned char *blimit) {
  int col;
  for (col = 4; col <= 12; col += 4)
    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, blimit);
}
84 #endif | 85 #endif |
85 | 86 |
86 #if HAVE_SSE2 | 87 #if HAVE_SSE2 |
| 88 |
/* Wide (15-tap) horizontal macroblock loop filter, 16 pixels at a time.
 *
 * Filters one horizontal edge using up to 8 rows on each side
 * (p7..p0 above the edge, q0..q7 below).  Three nested per-pixel masks
 * are computed as 0x00/0xff byte lanes:
 *   mask  - ordinary filter on/off decision (blimit / limit tests)
 *   flat  - "flat segment" test over p4..q4; selects the 7-tap smoothing
 *   flat2 - wide flat test over p7..q7; selects the 15-tap smoothing
 *
 * s        points at the q0 row (first row below the edge)
 * p        row pitch in bytes
 * _blimit, _limit, _thresh
 *          single-byte thresholds, broadcast to all 16 lanes below.
 */
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh) {
  /* Scratch rows holding the 15-tap filter outputs op7..op1 / oq1..oq7. */
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  /* Scratch rows holding the 7-tap "flat" filter outputs. */
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  /* Replicate each single-byte threshold into all four bytes of a word,
   * then broadcast that word across the 128-bit register. */
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  /* Load the inner neighborhood p4..q4 for the mask / flat decisions. */
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  {
    /* abs(a - b) for unsigned bytes: (a -sat b) | (b -sat a). */
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all-ones
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    /* hev = (max(|p1-p0|, |q1-q0|) > thresh): high edge variance lanes. */
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    /* |p0-q0|*2 + |p1-q1|/2 (the & fe avoids shifting bits between
     * byte lanes when using the 16-bit shift). */
    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                         _mm_or_si128(_mm_subs_epu8(p3, p2),
                                      _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                         _mm_or_si128(_mm_subs_epu8(q3, q2),
                                      _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    /* mask is 0xff where every inter-pixel step is within limit AND the
     * blimit test passed; 0x00 elsewhere. */
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    /* flat = all of |p2..p4 - p0| and |q2..q4 - q0| <= 1, i.e. the
     * 5-pixel neighborhood on each side is flat. */
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                         _mm_or_si128(_mm_subs_epu8(q2, q0),
                                      _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                         _mm_or_si128(_mm_subs_epu8(q3, q0),
                                      _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                     _mm_subs_epu8(p0, p4)),
                         _mm_or_si128(_mm_subs_epu8(q4, q0),
                                      _mm_subs_epu8(q0, q4)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);  // flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // calculate flat2
  /* Reuse the p4..q4 registers to hold the outer pixels p7..p5 / q5..q7
   * (p0 and q0 are still live from the loads above). */
  p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  // p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  // q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    /* Same <=1 flatness test as above, now against the outer pixels,
     * yielding the wide-filter selector. */
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    __m128i work;
    flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                         _mm_or_si128(_mm_subs_epu8(q2, q0),
                                      _mm_subs_epu8(q0, q2)));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                         _mm_or_si128(_mm_subs_epu8(q3, q0),
                                      _mm_subs_epu8(q0, q3)));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                     _mm_subs_epu8(p0, p4)),
                         _mm_or_si128(_mm_subs_epu8(q4, q0),
                                      _mm_subs_epu8(q0, q4)));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }
  // calculate flat2
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  /* 7-tap flat filter: processed 8 pixels per iteration in 16-bit
   * precision, results parked in the flat_* scratch rows.
   * workp_a/workp_b hold running partial sums that are updated
   * incrementally (subtract the tap that leaves, add the one that
   * enters) between outputs — the statement order is significant. */
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    i = 0;
    do {
      __m128i workp_a, workp_b, workp_shft;
      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);

      /* op2 = (2*p4 + p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3 */
      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      /* q4 re-enters below because the outermost sample is replicated
       * at the window boundary. */
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // wide flat
  // TODO(slavarnway): interleave with the flat pixel calculations (see above)
  /* 15-tap wide filter over p7..q7, same incremental-running-sum scheme
   * as the 7-tap loop, in two 8-pixel halves; outputs to flat2_op/oq. */
  {
    const __m128i eight = _mm_set1_epi16(8);
    unsigned char *src = s;
    int i = 0;
    do {
      __m128i workp_a, workp_b, workp_shft;
      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);


      /* op6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4 */
      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      /* Each subsequent output drops one replicated p7 tap and slides
       * the window one pixel toward q7. */
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      /* From oq0 on, the replicated boundary tap is q7. */
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // wide flat
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  // lp filter
  /* Ordinary 4-tap filter on signed (bias-0x80) pixels, then merge the
   * three candidate outputs per pixel via the mask/flat/flat2 selects. */
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    /* Convert to signed representation by flipping the sign bit. */
    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                t80);
    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                t80);
    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                t80);
    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 (arithmetic shift emulated on bytes: shift as words,
     * mask off bits leaked from the neighbor lane, re-insert sign). */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    /* p1/q1 are only adjusted where hev is NOT set. */
    filt = _mm_andnot_si128(hev, filt);

    /* Back to unsigned pixel domain. */
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

    // write out op6 - op3
    /* Outer rows: wide filter output where flat2, else original pixel. */
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_loadu_si128((__m128i *)dst);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    /* Inner rows: select original -> flat (7-tap) -> flat2 (15-tap). */
    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    work_a = _mm_or_si128(work_a, p2);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    work_a = _mm_or_si128(work_a, p1);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    work_a = _mm_or_si128(work_a, p0);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    work_a = _mm_or_si128(work_a, q0);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    work_a = _mm_or_si128(work_a, q1);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    work_a = _mm_or_si128(work_a, q2);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    /* Outer rows below the edge, mirroring the op6-op3 loop. */
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_loadu_si128((__m128i *)dst);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}
| 561 |
87 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, | 562 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, |
88 int p, | 563 int p, |
89 const unsigned char *_blimit, | 564 const unsigned char *_blimit, |
90 const unsigned char *_limit, | 565 const unsigned char *_limit, |
91 const unsigned char *_thresh) { | 566 const unsigned char *_thresh) { |
92 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); | 567 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); |
93 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); | 568 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); |
94 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); | 569 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); |
95 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); | 570 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); |
96 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); | 571 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); |
(...skipping 457 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
554 src[0] = t_dst + 3 * 16; | 1029 src[0] = t_dst + 3 * 16; |
555 src[1] = t_dst + 3 * 16 + 8; | 1030 src[1] = t_dst + 3 * 16 + 8; |
556 | 1031 |
557 dst[0] = s - 5; | 1032 dst[0] = s - 5; |
558 dst[1] = s - 5 + p * 8; | 1033 dst[1] = s - 5 + p * 8; |
559 | 1034 |
560 /* Transpose 16x8 */ | 1035 /* Transpose 16x8 */ |
561 transpose(src, 16, dst, p, 2); | 1036 transpose(src, 16, dst, p, 2); |
562 } | 1037 } |
563 | 1038 |
| 1039 void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, |
| 1040 int p, |
| 1041 const unsigned char *blimit, |
| 1042 const unsigned char *limit, |
| 1043 const unsigned char *thresh) { |
| 1044 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); |
| 1045 unsigned char *src[4]; |
| 1046 unsigned char *dst[4]; |
| 1047 |
| 1048 /* Transpose 16x16 */ |
| 1049 transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); |
| 1050 transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16); |
| 1051 |
| 1052 /* Loop filtering */ |
| 1053 vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, |
| 1054 thresh); |
| 1055 |
| 1056 src[0] = t_dst; |
| 1057 src[1] = t_dst + 8 * 16; |
| 1058 src[2] = t_dst + 8; |
| 1059 src[3] = t_dst + 8 * 16 + 8; |
| 1060 |
| 1061 dst[0] = s - 8; |
| 1062 dst[1] = s - 8 + 8; |
| 1063 dst[2] = s - 8 + p * 8; |
| 1064 dst[3] = s - 8 + p * 8 + 8; |
| 1065 |
| 1066 /* Transpose 16x16 */ |
| 1067 transpose(src, 16, dst, p, 4); |
| 1068 } |
| 1069 |
| 1070 |
564 void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u, | 1071 void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u, |
565 int p, | 1072 int p, |
566 const unsigned char *blimit, | 1073 const unsigned char *blimit, |
567 const unsigned char *limit, | 1074 const unsigned char *limit, |
568 const unsigned char *thresh, | 1075 const unsigned char *thresh, |
569 unsigned char *v) { | 1076 unsigned char *v) { |
570 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); | 1077 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); |
571 unsigned char *src[2]; | 1078 unsigned char *src[2]; |
572 unsigned char *dst[2]; | 1079 unsigned char *dst[2]; |
573 | 1080 |
(...skipping 22 matching lines...) Expand all Loading... |
596 struct loop_filter_info *lfi) { | 1103 struct loop_filter_info *lfi) { |
597 vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, | 1104 vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, |
598 lfi->lim, lfi->hev_thr); | 1105 lfi->lim, lfi->hev_thr); |
599 | 1106 |
600 /* u,v */ | 1107 /* u,v */ |
601 if (u_ptr) | 1108 if (u_ptr) |
602 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, | 1109 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, |
603 lfi->lim, lfi->hev_thr, v_ptr); | 1110 lfi->lim, lfi->hev_thr, v_ptr); |
604 } | 1111 } |
605 | 1112 |
| 1113 |
| 1114 void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr, |
| 1115 unsigned char *v_ptr, int y_stride, int uv_stride, |
| 1116 struct loop_filter_info *lfi) { |
| 1117 vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride, |
| 1118 lfi->mblim, lfi->lim, lfi->hev_thr); |
| 1119 |
| 1120 /* u,v */ |
| 1121 if (u_ptr) |
| 1122 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, |
| 1123 lfi->lim, lfi->hev_thr, v_ptr); |
| 1124 } |
| 1125 |
| 1126 |
606 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, | 1127 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, |
607 unsigned char *v_ptr, int y_stride, int uv_stride, | 1128 unsigned char *v_ptr, int y_stride, int uv_stride, |
608 struct loop_filter_info *lfi) { | 1129 struct loop_filter_info *lfi) { |
609 vp9_mbloop_filter_horizontal_edge_sse2( | 1130 vp9_mbloop_filter_horizontal_edge_sse2( |
610 y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); | 1131 y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); |
| 1132 |
| 1133 if (u_ptr) |
| 1134 vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, |
| 1135 lfi->blim, lfi->lim, lfi->hev_thr, |
| 1136 v_ptr + 4 * uv_stride); |
611 } | 1137 } |
612 | 1138 |
613 /* Vertical MB Filtering */ | 1139 /* Vertical MB Filtering */ |
614 void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, | 1140 void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, |
615 unsigned char *v_ptr, int y_stride, int uv_stride, | 1141 unsigned char *v_ptr, int y_stride, int uv_stride, |
616 struct loop_filter_info *lfi) { | 1142 struct loop_filter_info *lfi) { |
617 vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, | 1143 vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, |
618 lfi->hev_thr); | 1144 lfi->hev_thr); |
619 | 1145 |
620 /* u,v */ | 1146 /* u,v */ |
621 if (u_ptr) | 1147 if (u_ptr) |
622 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, | 1148 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, |
623 lfi->lim, lfi->hev_thr, v_ptr); | 1149 lfi->lim, lfi->hev_thr, v_ptr); |
624 } | 1150 } |
625 | 1151 |
| 1152 |
| 1153 void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr, |
| 1154 unsigned char *v_ptr, int y_stride, int uv_stride, |
| 1155 struct loop_filter_info *lfi) { |
| 1156 vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride, |
| 1157 lfi->mblim, lfi->lim, lfi->hev_thr); |
| 1158 |
| 1159 /* u,v */ |
| 1160 if (u_ptr) |
| 1161 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, |
| 1162 lfi->lim, lfi->hev_thr, v_ptr); |
| 1163 } |
| 1164 |
| 1165 |
626 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, | 1166 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, |
627 unsigned char *v_ptr, int y_stride, int uv_stride, | 1167 unsigned char *v_ptr, int y_stride, int uv_stride, |
628 struct loop_filter_info *lfi) { | 1168 struct loop_filter_info *lfi) { |
629 vp9_mbloop_filter_vertical_edge_sse2( | 1169 vp9_mbloop_filter_vertical_edge_sse2( |
630 y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); | 1170 y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); |
| 1171 |
| 1172 if (u_ptr) |
| 1173 vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, |
| 1174 lfi->blim, lfi->lim, lfi->hev_thr, |
| 1175 v_ptr + 4); |
631 } | 1176 } |
632 | 1177 |
633 /* Horizontal B Filtering */ | 1178 /* Horizontal B Filtering */ |
634 void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, | 1179 void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, |
635 unsigned char *u_ptr, unsigned char *v_ptr, | 1180 unsigned char *u_ptr, unsigned char *v_ptr, |
636 int y_stride, int uv_stride, | 1181 int y_stride, int uv_stride, |
637 struct loop_filter_info *lfi) { | 1182 struct loop_filter_info *lfi) { |
638 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, | 1183 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, |
639 lfi->blim, lfi->lim, lfi->hev_thr, 2); | 1184 lfi->blim, lfi->lim, lfi->hev_thr, 2); |
640 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, | 1185 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
677 } | 1222 } |
678 | 1223 |
/* Apply the simple vertical-edge filter to the three internal block
 * edges of a 16-wide macroblock (columns 4, 8 and 12). */
void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int col;
  for (col = 4; col <= 12; col += 4)
    vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, blimit);
}
685 | 1230 |
686 #endif | 1231 #endif |
OLD | NEW |