OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 #include <immintrin.h> // AVX2 | 10 #include <immintrin.h> // AVX2 |
(...skipping 13 matching lines...)
24 ref0 = ref[0]; | 24 ref0 = ref[0]; |
25 ref1 = ref[1]; | 25 ref1 = ref[1]; |
26 ref2 = ref[2]; | 26 ref2 = ref[2]; |
27 ref3 = ref[3]; | 27 ref3 = ref[3]; |
28 sum_ref0 = _mm256_set1_epi16(0); | 28 sum_ref0 = _mm256_set1_epi16(0); |
29 sum_ref1 = _mm256_set1_epi16(0); | 29 sum_ref1 = _mm256_set1_epi16(0); |
30 sum_ref2 = _mm256_set1_epi16(0); | 30 sum_ref2 = _mm256_set1_epi16(0); |
31 sum_ref3 = _mm256_set1_epi16(0); | 31 sum_ref3 = _mm256_set1_epi16(0); |
32 for (i = 0; i < 32 ; i++) { | 32 for (i = 0; i < 32 ; i++) { |
33 // load src and all refs | 33 // load src and all refs |
34 src_reg = _mm256_load_si256((__m256i *)(src)); | 34 src_reg = _mm256_loadu_si256((__m256i *)(src)); |
35 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); | 35 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); |
36 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); | 36 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); |
37 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); | 37 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); |
38 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); | 38 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); |
39 // sum of the absolute differences between every ref-i to src | 39 // sum of the absolute differences between every ref-i to src |
40 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); | 40 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); |
41 ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); | 41 ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); |
42 ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); | 42 ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); |
43 ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); | 43 ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); |
44 // sum every ref-i | 44 // sum every ref-i |
(...skipping 51 matching lines...)
96 ref0 = ref[0]; | 96 ref0 = ref[0]; |
97 ref1 = ref[1]; | 97 ref1 = ref[1]; |
98 ref2 = ref[2]; | 98 ref2 = ref[2]; |
99 ref3 = ref[3]; | 99 ref3 = ref[3]; |
100 sum_ref0 = _mm256_set1_epi16(0); | 100 sum_ref0 = _mm256_set1_epi16(0); |
101 sum_ref1 = _mm256_set1_epi16(0); | 101 sum_ref1 = _mm256_set1_epi16(0); |
102 sum_ref2 = _mm256_set1_epi16(0); | 102 sum_ref2 = _mm256_set1_epi16(0); |
103 sum_ref3 = _mm256_set1_epi16(0); | 103 sum_ref3 = _mm256_set1_epi16(0); |
104 for (i = 0; i < 64 ; i++) { | 104 for (i = 0; i < 64 ; i++) { |
105 // load 64 bytes from src and all refs | 105 // load 64 bytes from src and all refs |
106 src_reg = _mm256_load_si256((__m256i *)(src)); | 106 src_reg = _mm256_loadu_si256((__m256i *)(src)); |
107 srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); | 107 srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); |
108 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); | 108 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); |
109 ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); | 109 ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); |
110 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); | 110 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); |
111 ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); | 111 ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); |
112 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); | 112 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); |
113 ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); | 113 ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); |
114 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); | 114 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); |
115 ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); | 115 ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); |
116 // sum of the absolute differences between every ref-i to src | 116 // sum of the absolute differences between every ref-i to src |
117 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); | 117 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); |
(...skipping 40 matching lines...)
158 // add the low 64 bit to the high 64 bit | 158 // add the low 64 bit to the high 64 bit |
159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
160 | 160 |
161 // add the low 128 bit to the high 128 bit | 161 // add the low 128 bit to the high 128 bit |
162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
163 _mm256_extractf128_si256(sum_mlow, 1)); | 163 _mm256_extractf128_si256(sum_mlow, 1)); |
164 | 164 |
165 _mm_storeu_si128((__m128i *)(res), sum); | 165 _mm_storeu_si128((__m128i *)(res), sum); |
166 } | 166 } |
167 } | 167 } |
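
The change above swaps _mm256_load_si256 for _mm256_loadu_si256 on the source loads in both the 32-wide and 64-wide 4D-SAD loops: the aligned form faults when the pointer is not 32-byte aligned, while the unaligned form accepts any address and costs essentially nothing extra on aligned data on recent x86 cores. Below is a minimal standalone sketch of the same pattern, not the file's actual function: the name sad32x32x4d_sketch and the per-accumulator scalar reduction are assumptions for illustration; the real code instead interleaves the four sums and writes them with a single _mm_storeu_si128, as in the tail shown above.

#include <immintrin.h>
#include <stdint.h>

/* Minimal sketch of the 32-wide 4D-SAD pattern; hypothetical helper, not the
 * libvpx API. src and ref rows are all read with unaligned loads. */
static void sad32x32x4d_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               uint32_t res[4]) {
  __m256i sums[4];
  int i, r;
  for (r = 0; r < 4; r++) sums[r] = _mm256_setzero_si256();

  for (i = 0; i < 32; i++) {
    /* one 32-byte source row; loadu tolerates any alignment */
    const __m256i s =
        _mm256_loadu_si256((const __m256i *)(src + i * src_stride));
    for (r = 0; r < 4; r++) {
      const __m256i rr =
          _mm256_loadu_si256((const __m256i *)(ref[r] + i * ref_stride));
      /* _mm256_sad_epu8 yields four 64-bit partial sums per register */
      sums[r] = _mm256_add_epi64(sums[r], _mm256_sad_epu8(rr, s));
    }
  }

  for (r = 0; r < 4; r++) {
    /* fold the four 64-bit partial sums down to one 32-bit total */
    __m128i v = _mm_add_epi64(_mm256_castsi256_si128(sums[r]),
                              _mm256_extracti128_si256(sums[r], 1));
    v = _mm_add_epi64(v, _mm_srli_si128(v, 8));
    res[r] = (uint32_t)_mm_cvtsi128_si32(v);
  }
}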