OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <immintrin.h> // AVX2 | 11 #include <immintrin.h> // AVX2 |
12 | 12 |
13 void vp9_get16x16var_avx2(const unsigned char *src_ptr, | 13 #include "./vpx_dsp_rtcd.h" |
| 14 |
| 15 void vpx_get16x16var_avx2(const unsigned char *src_ptr, |
14 int source_stride, | 16 int source_stride, |
15 const unsigned char *ref_ptr, | 17 const unsigned char *ref_ptr, |
16 int recon_stride, | 18 int recon_stride, |
17 unsigned int *SSE, | 19 unsigned int *SSE, |
18 int *Sum) { | 20 int *Sum) { |
19 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; | 21 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; |
20 __m256i ref_expand_high, madd_low, madd_high; | 22 __m256i ref_expand_high, madd_low, madd_high; |
21 unsigned int i, src_2strides, ref_2strides; | 23 unsigned int i, src_2strides, ref_2strides; |
22 __m256i zero_reg = _mm256_set1_epi16(0); | 24 __m256i zero_reg = _mm256_set1_epi16(0); |
23 __m256i sum_ref_src = _mm256_set1_epi16(0); | 25 __m256i sum_ref_src = _mm256_set1_epi16(0); |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
114 | 116 |
115 madd_res = _mm_add_epi32(madd_res, expand_madd); | 117 madd_res = _mm_add_epi32(madd_res, expand_madd); |
116 sum_res = _mm_add_epi32(sum_res, ex_expand_sum); | 118 sum_res = _mm_add_epi32(sum_res, ex_expand_sum); |
117 | 119 |
118 *((int*)SSE)= _mm_cvtsi128_si32(madd_res); | 120 *((int*)SSE)= _mm_cvtsi128_si32(madd_res); |
119 | 121 |
120 *((int*)Sum)= _mm_cvtsi128_si32(sum_res); | 122 *((int*)Sum)= _mm_cvtsi128_si32(sum_res); |
121 } | 123 } |
122 } | 124 } |
123 | 125 |
124 void vp9_get32x32var_avx2(const unsigned char *src_ptr, | 126 void vpx_get32x32var_avx2(const unsigned char *src_ptr, |
125 int source_stride, | 127 int source_stride, |
126 const unsigned char *ref_ptr, | 128 const unsigned char *ref_ptr, |
127 int recon_stride, | 129 int recon_stride, |
128 unsigned int *SSE, | 130 unsigned int *SSE, |
129 int *Sum) { | 131 int *Sum) { |
130 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; | 132 __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; |
131 __m256i ref_expand_high, madd_low, madd_high; | 133 __m256i ref_expand_high, madd_low, madd_high; |
132 unsigned int i; | 134 unsigned int i; |
133 __m256i zero_reg = _mm256_set1_epi16(0); | 135 __m256i zero_reg = _mm256_set1_epi16(0); |
134 __m256i sum_ref_src = _mm256_set1_epi16(0); | 136 __m256i sum_ref_src = _mm256_set1_epi16(0); |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
204 sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); | 206 sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); |
205 | 207 |
206 // extract the low lane and the high lane and add the results | 208 // extract the low lane and the high lane and add the results |
207 *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + | 209 *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + |
208 _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); | 210 _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); |
209 | 211 |
210 *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + | 212 *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + |
211 _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); | 213 _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); |
212 } | 214 } |
213 } | 215 } |
OLD | NEW |