| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <immintrin.h> // AVX2 | 11 #include <immintrin.h> // AVX2 |
| 12 | 12 |
| 13 void vp9_get16x16var_avx2(const unsigned char *src_ptr, | 13 #include "./vpx_dsp_rtcd.h" |
| 14 |
| 15 void vpx_get16x16var_avx2(const unsigned char *src_ptr, |
| 14 int source_stride, | 16 int source_stride, |
| 15 const unsigned char *ref_ptr, | 17 const unsigned char *ref_ptr, |
| 16 int recon_stride, | 18 int recon_stride, |
| 17 unsigned int *SSE, | 19 unsigned int *SSE, |
| 18 int *Sum) { | 20 int *Sum) { |
| 19 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; | 21 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; |
| 20 __m256i ref_expand_high, madd_low, madd_high; | 22 __m256i ref_expand_high, madd_low, madd_high; |
| 21 unsigned int i, src_2strides, ref_2strides; | 23 unsigned int i, src_2strides, ref_2strides; |
| 22 __m256i zero_reg = _mm256_set1_epi16(0); | 24 __m256i zero_reg = _mm256_set1_epi16(0); |
| 23 __m256i sum_ref_src = _mm256_set1_epi16(0); | 25 __m256i sum_ref_src = _mm256_set1_epi16(0); |
| (...skipping 90 matching lines...) |
| 114 | 116 |
| 115 madd_res = _mm_add_epi32(madd_res, expand_madd); | 117 madd_res = _mm_add_epi32(madd_res, expand_madd); |
| 116 sum_res = _mm_add_epi32(sum_res, ex_expand_sum); | 118 sum_res = _mm_add_epi32(sum_res, ex_expand_sum); |
| 117 | 119 |
| 118 *((int*)SSE)= _mm_cvtsi128_si32(madd_res); | 120 *((int*)SSE)= _mm_cvtsi128_si32(madd_res); |
| 119 | 121 |
| 120 *((int*)Sum)= _mm_cvtsi128_si32(sum_res); | 122 *((int*)Sum)= _mm_cvtsi128_si32(sum_res); |
| 121 } | 123 } |
| 122 } | 124 } |
| 123 | 125 |
| 124 void vp9_get32x32var_avx2(const unsigned char *src_ptr, | 126 void vpx_get32x32var_avx2(const unsigned char *src_ptr, |
| 125 int source_stride, | 127 int source_stride, |
| 126 const unsigned char *ref_ptr, | 128 const unsigned char *ref_ptr, |
| 127 int recon_stride, | 129 int recon_stride, |
| 128 unsigned int *SSE, | 130 unsigned int *SSE, |
| 129 int *Sum) { | 131 int *Sum) { |
| 130 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; | 132 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; |
| 131 __m256i ref_expand_high, madd_low, madd_high; | 133 __m256i ref_expand_high, madd_low, madd_high; |
| 132 unsigned int i; | 134 unsigned int i; |
| 133 __m256i zero_reg = _mm256_set1_epi16(0); | 135 __m256i zero_reg = _mm256_set1_epi16(0); |
| 134 __m256i sum_ref_src = _mm256_set1_epi16(0); | 136 __m256i sum_ref_src = _mm256_set1_epi16(0); |
| (...skipping 69 matching lines...) |
| 204 sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); | 206 sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); |
| 205 | 207 |
| 206 // extract the low lane and the high lane and add the results | 208 // extract the low lane and the high lane and add the results |
| 207 *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + | 209 *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + |
| 208 _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); | 210 _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); |
| 209 | 211 |
| 210 *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + | 212 *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + |
| 211 _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); | 213 _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); |
| 212 } | 214 } |
| 213 } | 215 } |
| OLD | NEW |