| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 #include <immintrin.h> // AVX2 | 10 #include <immintrin.h> // AVX2 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 24 ref0 = ref[0]; | 24 ref0 = ref[0]; |
| 25 ref1 = ref[1]; | 25 ref1 = ref[1]; |
| 26 ref2 = ref[2]; | 26 ref2 = ref[2]; |
| 27 ref3 = ref[3]; | 27 ref3 = ref[3]; |
| 28 sum_ref0 = _mm256_set1_epi16(0); | 28 sum_ref0 = _mm256_set1_epi16(0); |
| 29 sum_ref1 = _mm256_set1_epi16(0); | 29 sum_ref1 = _mm256_set1_epi16(0); |
| 30 sum_ref2 = _mm256_set1_epi16(0); | 30 sum_ref2 = _mm256_set1_epi16(0); |
| 31 sum_ref3 = _mm256_set1_epi16(0); | 31 sum_ref3 = _mm256_set1_epi16(0); |
| 32 for (i = 0; i < 32 ; i++) { | 32 for (i = 0; i < 32 ; i++) { |
| 33 // load src and all refs | 33 // load src and all refs |
| 34 src_reg = _mm256_load_si256((__m256i *)(src)); | 34 src_reg = _mm256_loadu_si256((__m256i *)(src)); |
| 35 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); | 35 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); |
| 36 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); | 36 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); |
| 37 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); | 37 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); |
| 38 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); | 38 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); |
| 39 // sum of the absolute differences between every ref-i to src | 39 // sum of the absolute differences between every ref-i to src |
| 40 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); | 40 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); |
| 41 ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); | 41 ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); |
| 42 ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); | 42 ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); |
| 43 ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); | 43 ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); |
| 44 // sum every ref-i | 44 // sum every ref-i |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 96 ref0 = ref[0]; | 96 ref0 = ref[0]; |
| 97 ref1 = ref[1]; | 97 ref1 = ref[1]; |
| 98 ref2 = ref[2]; | 98 ref2 = ref[2]; |
| 99 ref3 = ref[3]; | 99 ref3 = ref[3]; |
| 100 sum_ref0 = _mm256_set1_epi16(0); | 100 sum_ref0 = _mm256_set1_epi16(0); |
| 101 sum_ref1 = _mm256_set1_epi16(0); | 101 sum_ref1 = _mm256_set1_epi16(0); |
| 102 sum_ref2 = _mm256_set1_epi16(0); | 102 sum_ref2 = _mm256_set1_epi16(0); |
| 103 sum_ref3 = _mm256_set1_epi16(0); | 103 sum_ref3 = _mm256_set1_epi16(0); |
| 104 for (i = 0; i < 64 ; i++) { | 104 for (i = 0; i < 64 ; i++) { |
| 105 // load 64 bytes from src and all refs | 105 // load 64 bytes from src and all refs |
| 106 src_reg = _mm256_load_si256((__m256i *)(src)); | 106 src_reg = _mm256_loadu_si256((__m256i *)(src)); |
| 107 srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); | 107 srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); |
| 108 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); | 108 ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); |
| 109 ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); | 109 ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); |
| 110 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); | 110 ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); |
| 111 ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); | 111 ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); |
| 112 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); | 112 ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); |
| 113 ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); | 113 ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); |
| 114 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); | 114 ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); |
| 115 ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); | 115 ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); |
| 116 // sum of the absolute differences between every ref-i to src | 116 // sum of the absolute differences between every ref-i to src |
| 117 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); | 117 ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 158 // add the low 64 bit to the high 64 bit | 158 // add the low 64 bit to the high 64 bit |
| 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); | 159 sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); |
| 160 | 160 |
| 161 // add the low 128 bit to the high 128 bit | 161 // add the low 128 bit to the high 128 bit |
| 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), | 162 sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), |
| 163 _mm256_extractf128_si256(sum_mlow, 1)); | 163 _mm256_extractf128_si256(sum_mlow, 1)); |
| 164 | 164 |
| 165 _mm_storeu_si128((__m128i *)(res), sum); | 165 _mm_storeu_si128((__m128i *)(res), sum); |
| 166 } | 166 } |
| 167 } | 167 } |
| OLD | NEW |