OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "vp8/encoder/denoising.h" | 11 #include "vp8/encoder/denoising.h" |
12 #include "vp8/common/reconinter.h" | 12 #include "vp8/common/reconinter.h" |
13 #include "vpx/vpx_integer.h" | 13 #include "vpx/vpx_integer.h" |
14 #include "vpx_mem/vpx_mem.h" | 14 #include "vpx_mem/vpx_mem.h" |
15 #include "vp8_rtcd.h" | 15 #include "vp8_rtcd.h" |
16 | 16 |
17 #include <emmintrin.h> | 17 #include <emmintrin.h> |
18 #include "vpx_ports/emmintrin_compat.h" | 18 #include "vpx_ports/emmintrin_compat.h" |
19 | 19 |
20 union sum_union { | 20 /* Compute the absolute value of the summed pixel differences of this MB. */ |
21 __m128i v; | 21 static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { |
22 signed char e[16]; | 22 const __m128i k_1 = _mm_set1_epi16(1); |
23 }; | 23 const __m128i acc_diff_lo = _mm_srai_epi16( |
| 24 _mm_unpacklo_epi8(acc_diff, acc_diff), 8); |
| 25 const __m128i acc_diff_hi = _mm_srai_epi16( |
| 26 _mm_unpackhi_epi8(acc_diff, acc_diff), 8); |
| 27 const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); |
| 28 const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); |
| 29 const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, |
| 30 _mm_srli_si128(hg_fe_dc_ba, 8)); |
| 31 const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, |
| 32 _mm_srli_si128(hgfe_dcba, 4)); |
| 33 const int sum_diff = _mm_cvtsi128_si32(hgfedcba); |
| 34 |
| 35 return (unsigned int)abs(sum_diff); |
| 36 } |
24 | 37 |
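A note on the new helper: it sign-extends each accumulated byte lane to 16 bits (unpacking the register against itself, then arithmetic-shifting right by 8), multiply-adds against a vector of ones to form 32-bit partial sums, and folds those with two byte-shifts. A minimal scalar reference of the same reduction, handy for cross-checking the intrinsics in a unit test (the name and signature are illustrative, not part of this change):

    #include <stdlib.h> /* abs() */

    /* Illustrative scalar equivalent of abs_sum_diff_16x1(): sum the 16
     * signed byte lanes of the accumulator, then take the absolute value. */
    static unsigned int abs_sum_diff_16x1_c(const signed char acc[16]) {
      int sum = 0, i;
      for (i = 0; i < 16; ++i) sum += acc[i];
      return (unsigned int)abs(sum);
    }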
25 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, | 38 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, |
26 int mc_avg_y_stride, | 39 int mc_avg_y_stride, |
27 unsigned char *running_avg_y, int avg_y_stride, | 40 unsigned char *running_avg_y, int avg_y_stride, |
28 unsigned char *sig, int sig_stride, | 41 unsigned char *sig, int sig_stride, |
29 unsigned int motion_magnitude, | 42 unsigned int motion_magnitude, |
30 int increase_denoising) | 43 int increase_denoising) |
31 { | 44 { |
32 unsigned char *running_avg_y_start = running_avg_y; | 45 unsigned char *running_avg_y_start = running_avg_y; |
33 unsigned char *sig_start = sig; | 46 unsigned char *sig_start = sig; |
34 int sum_diff_thresh; | 47 unsigned int sum_diff_thresh; |
35 int r; | 48 int r; |
36 int shift_inc = (increase_denoising && | 49 int shift_inc = (increase_denoising && |
37 motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; | 50 motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; |
38 __m128i acc_diff = _mm_setzero_si128(); | 51 __m128i acc_diff = _mm_setzero_si128(); |
39 const __m128i k_0 = _mm_setzero_si128(); | 52 const __m128i k_0 = _mm_setzero_si128(); |
40 const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); | 53 const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); |
41 const __m128i k_8 = _mm_set1_epi8(8); | 54 const __m128i k_8 = _mm_set1_epi8(8); |
42 const __m128i k_16 = _mm_set1_epi8(16); | 55 const __m128i k_16 = _mm_set1_epi8(16); |
43 /* Modify each level's adjustment according to motion_magnitude. */ | 56 /* Modify each level's adjustment according to motion_magnitude. */ |
44 const __m128i l3 = _mm_set1_epi8( | 57 const __m128i l3 = _mm_set1_epi8( |
(...skipping 51 matching lines...)
96 acc_diff = _mm_subs_epi8(acc_diff, nadj); | 109 acc_diff = _mm_subs_epi8(acc_diff, nadj); |
97 | 110 |
98 /* Update pointers for next iteration. */ | 111 /* Update pointers for next iteration. */ |
99 sig += sig_stride; | 112 sig += sig_stride; |
100 mc_running_avg_y += mc_avg_y_stride; | 113 mc_running_avg_y += mc_avg_y_stride; |
101 running_avg_y += avg_y_stride; | 114 running_avg_y += avg_y_stride; |
102 } | 115 } |
103 | 116 |
104 { | 117 { |
105 /* Compute the sum of all pixel differences of this MB. */ | 118 /* Compute the sum of all pixel differences of this MB. */ |
106 union sum_union s; | 119 unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); |
107 int sum_diff = 0; | |
108 s.v = acc_diff; | |
109 sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] | |
110 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] | |
111 + s.e[12] + s.e[13] + s.e[14] + s.e[15]; | |
112 | |
113 sum_diff_thresh = SUM_DIFF_THRESHOLD; | 120 sum_diff_thresh = SUM_DIFF_THRESHOLD; |
114 if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; | 121 if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; |
115 if (abs(sum_diff) > sum_diff_thresh) { | 122 if (abs_sum_diff > sum_diff_thresh) { |
116 // Before returning to copy the block (i.e., apply no denoising), | 123 // Before returning to copy the block (i.e., apply no denoising), |
117 // check if we can still apply some (weaker) temporal filtering to | 124 // check if we can still apply some (weaker) temporal filtering to |
118 // this block, which would otherwise not be denoised at all. Simplest | 125 // this block, which would otherwise not be denoised at all. Simplest |
119 // is to apply an additional adjustment to running_avg_y to bring it | 126 // is to apply an additional adjustment to running_avg_y to bring it |
120 // closer to sig. The adjustment is capped by a maximum delta, and | 127 // closer to sig. The adjustment is capped by a maximum delta, and |
121 // chosen such that in most cases the resulting sum_diff will be | 128 // chosen such that in most cases the resulting sum_diff will be |
122 // within the acceptable range given by sum_diff_thresh. | 129 // within the acceptable range given by sum_diff_thresh. |
123 | 130 |
124 // The delta is set by the excess of absolute pixel diff over the | 131 // The delta is set by the excess of absolute pixel diff over the |
125 // threshold. | 132 // threshold. |
126 int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1; | 133 int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; |
127 // Only apply the adjustment for max delta up to 3. | 134 // Only apply the adjustment for max delta up to 3. |
128 if (delta < 4) { | 135 if (delta < 4) { |
129 const __m128i k_delta = _mm_set1_epi8(delta); | 136 const __m128i k_delta = _mm_set1_epi8(delta); |
130 sig -= sig_stride * 16; | 137 sig -= sig_stride * 16; |
131 mc_running_avg_y -= mc_avg_y_stride * 16; | 138 mc_running_avg_y -= mc_avg_y_stride * 16; |
132 running_avg_y -= avg_y_stride * 16; | 139 running_avg_y -= avg_y_stride * 16; |
133 for (r = 0; r < 16; ++r) { | 140 for (r = 0; r < 16; ++r) { |
134 __m128i v_running_avg_y = | 141 __m128i v_running_avg_y = |
135 _mm_loadu_si128((__m128i *)(&running_avg_y[0])); | 142 _mm_loadu_si128((__m128i *)(&running_avg_y[0])); |
136 // Calculate differences. | 143 // Calculate differences. |
(...skipping 18 matching lines...)
155 | 162 |
156 // Accumulate the adjustments. | 163 // Accumulate the adjustments. |
157 acc_diff = _mm_subs_epi8(acc_diff, padj); | 164 acc_diff = _mm_subs_epi8(acc_diff, padj); |
158 acc_diff = _mm_adds_epi8(acc_diff, nadj); | 165 acc_diff = _mm_adds_epi8(acc_diff, nadj); |
159 | 166 |
160 // Update pointers for next iteration. | 167 // Update pointers for next iteration. |
161 sig += sig_stride; | 168 sig += sig_stride; |
162 mc_running_avg_y += mc_avg_y_stride; | 169 mc_running_avg_y += mc_avg_y_stride; |
163 running_avg_y += avg_y_stride; | 170 running_avg_y += avg_y_stride; |
164 } | 171 } |
165 { | 172 abs_sum_diff = abs_sum_diff_16x1(acc_diff); |
166 // Update the sum of all pixel differences of this MB. | 173 if (abs_sum_diff > sum_diff_thresh) { |
167 union sum_union s; | 174 return COPY_BLOCK; |
168 s.v = acc_diff; | |
169 sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5] | |
170 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11] | |
171 + s.e[12] + s.e[13] + s.e[14] + s.e[15]; | |
172 if (abs(sum_diff) > sum_diff_thresh) { | |
173 return COPY_BLOCK; | |
174 } | |
175 } | 175 } |
176 } else { | 176 } else { |
177 return COPY_BLOCK; | 177 return COPY_BLOCK; |
178 } | 178 } |
179 } | 179 } |
180 } | 180 } |
181 | 181 |
182 vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); | 182 vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride); |
183 return FILTER_BLOCK; | 183 return FILTER_BLOCK; |
184 } | 184 } |
| 185 |
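To make the fallback path concrete with illustrative numbers (the real thresholds are defined in denoising.h): if sum_diff_thresh were 512 and the first pass accumulated abs_sum_diff = 700, then delta = ((700 - 512) >> 8) + 1 = 1, so each pixel of running_avg_y is nudged at most one step back toward sig and the sum is re-checked. With abs_sum_diff = 1600 the excess is 1088, giving delta = (1088 >> 8) + 1 = 5; since 5 >= 4 the adjustment is skipped and the block is copied unfiltered.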
| 186 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, |
| 187 int mc_avg_stride, |
| 188 unsigned char *running_avg, int avg_stride, |
| 189 unsigned char *sig, int sig_stride, |
| 190 unsigned int motion_magnitude, |
| 191 int increase_denoising) { |
| 192 unsigned char *running_avg_start = running_avg; |
| 193 unsigned char *sig_start = sig; |
| 194 unsigned int sum_diff_thresh; |
| 195 int r; |
| 196 int shift_inc = (increase_denoising && |
| 197 motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0; |
| 198 __m128i acc_diff = _mm_setzero_si128(); |
| 199 const __m128i k_0 = _mm_setzero_si128(); |
| 200 const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); |
| 201 const __m128i k_8 = _mm_set1_epi8(8); |
| 202 const __m128i k_16 = _mm_set1_epi8(16); |
| 203 /* Modify each level's adjustment according to motion_magnitude. */ |
| 204 const __m128i l3 = _mm_set1_epi8( |
| 205 (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? |
| 206 7 + shift_inc : 6); |
| 207 /* Difference between level 3 and level 2 is 2. */ |
| 208 const __m128i l32 = _mm_set1_epi8(2); |
| 209 /* Difference between level 2 and level 1 is 1. */ |
| 210 const __m128i l21 = _mm_set1_epi8(1); |
| 211 |
| 212 { |
| 213 const __m128i k_1 = _mm_set1_epi16(1); |
| 214 __m128i vec_sum_block = _mm_setzero_si128(); |
| 215 |
| 216 // Avoid denoising the color signal if it's close to the average level. |
| 217 for (r = 0; r < 8; ++r) { |
| 218 const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); |
| 219 const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); |
| 220 vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); |
| 221 sig += sig_stride; |
| 222 } |
| 223 sig -= sig_stride * 8; |
| 224 { |
| 225 const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); |
| 226 const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, |
| 227 _mm_srli_si128(hg_fe_dc_ba, 8)); |
| 228 const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, |
| 229 _mm_srli_si128(hgfe_dcba, 4)); |
| 230 const int sum_block = _mm_cvtsi128_si32(hgfedcba); |
| 231 if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { |
| 232 return COPY_BLOCK; |
| 233 } |
| 234 } |
| 235 } |
| 236 |
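The early-out above compares the block sum against 128 * 8 * 8 = 8192, the sum of an 8x8 chroma block sitting exactly at the neutral level 128. For example, a block whose 64 pixels all equal 129 sums to 8256, a distance of 64 from 8192; whenever that distance falls below SUM_DIFF_FROM_AVG_THRESH_UV, the chroma is considered close enough to neutral that denoising is skipped and the block is copied.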
| 237 for (r = 0; r < 4; ++r) { |
| 238 /* Calculate differences */ |
| 239 const __m128i v_sig_low = _mm_castpd_si128( |
| 240 _mm_load_sd((double *)(&sig[0]))); |
| 241 const __m128i v_sig = _mm_castpd_si128( |
| 242 _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), |
| 243 (double *)(&sig[sig_stride]))); |
| 244 const __m128i v_mc_running_avg_low = _mm_castpd_si128( |
| 245 _mm_load_sd((double *)(&mc_running_avg[0]))); |
| 246 const __m128i v_mc_running_avg = _mm_castpd_si128( |
| 247 _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), |
| 248 (double *)(&mc_running_avg[mc_avg_stride]))); |
| 249 const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); |
| 250 const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); |
| 251 /* Obtain the sign. FF if diff is negative. */ |
| 252 const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); |
| 253 /* Clamp the absolute difference to 16 to build the masks. Doing this |
| 254 * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */ |
| 255 const __m128i clamped_absdiff = _mm_min_epu8( |
| 256 _mm_or_si128(pdiff, ndiff), k_16); |
| 257 /* Get masks for l2 l1 and l0 adjustments */ |
| 258 const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); |
| 259 const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); |
| 260 const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); |
| 261 /* Get adjustments for l2, l1, and l0 */ |
| 262 __m128i adj2 = _mm_and_si128(mask2, l32); |
| 263 const __m128i adj1 = _mm_and_si128(mask1, l21); |
| 264 const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); |
| 265 __m128i adj, padj, nadj; |
| 266 __m128i v_running_avg; |
| 267 |
| 268 /* Combine the adjustments and get absolute adjustments. */ |
| 269 adj2 = _mm_add_epi8(adj2, adj1); |
| 270 adj = _mm_sub_epi8(l3, adj2); |
| 271 adj = _mm_andnot_si128(mask0, adj); |
| 272 adj = _mm_or_si128(adj, adj0); |
| 273 |
| 274 /* Restore the sign and get positive and negative adjustments. */ |
| 275 padj = _mm_andnot_si128(diff_sign, adj); |
| 276 nadj = _mm_and_si128(diff_sign, adj); |
| 277 |
| 278 /* Calculate filtered value. */ |
| 279 v_running_avg = _mm_adds_epu8(v_sig, padj); |
| 280 v_running_avg = _mm_subs_epu8(v_running_avg, nadj); |
| 281 |
| 282 _mm_storel_pd((double *)&running_avg[0], |
| 283 _mm_castsi128_pd(v_running_avg)); |
| 284 _mm_storeh_pd((double *)&running_avg[avg_stride], |
| 285 _mm_castsi128_pd(v_running_avg)); |
| 286 |
| 287 /* Adjustments are <= 7, so each element of acc_diff fits in a |
| 288 * signed char. |
| 289 */ |
| 290 acc_diff = _mm_adds_epi8(acc_diff, padj); |
| 291 acc_diff = _mm_subs_epi8(acc_diff, nadj); |
| 292 |
| 293 /* Update pointers for next iteration. */ |
| 294 sig += sig_stride * 2; |
| 295 mc_running_avg += mc_avg_stride * 2; |
| 296 running_avg += avg_stride * 2; |
| 297 } |
| 298 |
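The masked arithmetic in the loop above realizes a piecewise adjustment per pixel: differences of 16 or more get the full level-3 step, medium differences step down by 2 and then by 1 more, and differences below 4 pass through unchanged. A scalar sketch of that mapping (the function name is illustrative; l3 is the 7 + shift_inc or 6 chosen above):

    /* Illustrative scalar form of the masked level selection above. */
    static int uv_adjustment(int absdiff, int l3) {
      if (absdiff >= 16) return l3;         /* level 3: full step */
      if (absdiff >= 8) return l3 - 2;      /* level 2 */
      if (absdiff >= 4) return l3 - 2 - 1;  /* level 1 */
      return absdiff;                       /* small diffs applied as-is */
    }

The sign is then restored from diff_sign, and the adjustment is added to or subtracted from the signal with saturating byte arithmetic.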
| 299 { |
| 300 unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); |
| 301 sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; |
| 302 if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; |
| 303 if (abs_sum_diff > sum_diff_thresh) { |
| 304 // Before returning to copy the block (i.e., apply no denoising), |
| 305 // check if we can still apply some (weaker) temporal filtering to |
| 306 // this block, which would otherwise not be denoised at all. Simplest |
| 307 // is to apply an additional adjustment to running_avg to bring it |
| 308 // closer to sig. The adjustment is capped by a maximum delta, and |
| 309 // chosen such that in most cases the resulting sum_diff will be |
| 310 // within the acceptable range given by sum_diff_thresh. |
| 311 |
| 312 // The delta is set by the excess of absolute pixel diff over the |
| 313 // threshold. |
| 314 int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1; |
| 315 // Only apply the adjustment for max delta up to 3. |
| 316 if (delta < 4) { |
| 317 const __m128i k_delta = _mm_set1_epi8(delta); |
| 318 sig -= sig_stride * 8; |
| 319 mc_running_avg -= mc_avg_stride * 8; |
| 320 running_avg -= avg_stride * 8; |
| 321 for (r = 0; r < 4; ++r) { |
| 322 // Calculate differences. |
| 323 const __m128i v_sig_low = _mm_castpd_si128( |
| 324 _mm_load_sd((double *)(&sig[0]))); |
| 325 const __m128i v_sig = _mm_castpd_si128( |
| 326 _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), |
| 327 (double *)(&sig[sig_stride]))); |
| 328 const __m128i v_mc_running_avg_low = _mm_castpd_si128( |
| 329 _mm_load_sd((double *)(&mc_running_avg[0]))); |
| 330 const __m128i v_mc_running_avg = _mm_castpd_si128( |
| 331 _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), |
| 332 (double *)(&mc_running_avg[mc_avg_stride]))); |
| 333 const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); |
| 334 const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); |
| 335 // Obtain the sign. FF if diff is negative. |
| 336 const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); |
| 337 // Clamp absolute difference to delta to get the adjustment. |
| 338 const __m128i adj = |
| 339 _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); |
| 340 // Restore the sign and get positive and negative adjustments. |
| 341 __m128i padj, nadj; |
| 342 const __m128i v_running_avg_low = _mm_castpd_si128( |
| 343 _mm_load_sd((double *)(&running_avg[0]))); |
| 344 __m128i v_running_avg = _mm_castpd_si128( |
| 345 _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low), |
| 346 (double *)(&running_avg[avg_stride]))); |
| 347 padj = _mm_andnot_si128(diff_sign, adj); |
| 348 nadj = _mm_and_si128(diff_sign, adj); |
| 349 // Calculate filtered value. |
| 350 v_running_avg = _mm_subs_epu8(v_running_avg, padj); |
| 351 v_running_avg = _mm_adds_epu8(v_running_avg, nadj); |
| 352 |
| 353 _mm_storel_pd((double *)&running_avg[0], |
| 354 _mm_castsi128_pd(v_running_avg)); |
| 355 _mm_storeh_pd((double *)&running_avg[avg_stride], |
| 356 _mm_castsi128_pd(v_running_avg)); |
| 357 |
| 358 // Accumulate the adjustments. |
| 359 acc_diff = _mm_subs_epi8(acc_diff, padj); |
| 360 acc_diff = _mm_adds_epi8(acc_diff, nadj); |
| 361 |
| 362 // Update pointers for next iteration. |
| 363 sig += sig_stride * 2; |
| 364 mc_running_avg += mc_avg_stride * 2; |
| 365 running_avg += avg_stride * 2; |
| 366 } |
| 367 abs_sum_diff = abs_sum_diff_16x1(acc_diff); |
| 368 if (abs_sum_diff > sum_diff_thresh) { |
| 369 return COPY_BLOCK; |
| 370 } |
| 371 } else { |
| 372 return COPY_BLOCK; |
| 373 } |
| 374 } |
| 375 } |
| 376 |
| 377 vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); |
| 378 return FILTER_BLOCK; |
| 379 } |
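Both entry points share the same contract: on success the filtered result is written into the running average and copied back over the signal, and FILTER_BLOCK is returned; otherwise the signal is left as-is and COPY_BLOCK is returned (the caller then typically refreshes the running average from the source). A hypothetical call site, with placeholder buffers and strides:

    /* Hypothetical caller; buffer names and strides are placeholders. */
    int decision = vp8_denoiser_filter_sse2(
        mc_running_avg_y, mc_avg_y_stride, /* motion-compensated average */
        running_avg_y, avg_y_stride,       /* running average (updated) */
        src_y, src_y_stride,               /* noisy source macroblock */
        motion_magnitude, 0 /* increase_denoising */);
    if (decision == COPY_BLOCK) {
      /* Skip filtering; keep the source block as-is. */
    }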