/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "vpx_ports/mem.h"

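/* FSAD64_H(h) expands to vp9_sad64x<h>_avx2(), the sum of absolute
 * differences over a 64x<h> block.  Each 64-byte row is covered by two
 * unaligned 32-byte loads; _mm256_sad_epu8 leaves four 64-bit partial
 * sums per 256-bit register, which are accumulated across rows and
 * reduced to a single scalar after the loop (a shift/add within each
 * 128-bit lane, then a fold of the upper lane onto the lower one). */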
#define FSAD64_H(h) \
unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
                                  int src_stride, \
                                  const uint8_t *ref_ptr, \
                                  int ref_stride) { \
  int i, res; \
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
  __m256i sum_sad = _mm256_setzero_si256(); \
  __m256i sum_sad_h; \
  __m128i sum_sad128; \
  for (i = 0; i < h; i++) { \
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
    sad1_reg = _mm256_sad_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)src_ptr)); \
    sad2_reg = _mm256_sad_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
    ref_ptr += ref_stride; \
    src_ptr += src_stride; \
  } \
  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
  res = _mm_cvtsi128_si32(sum_sad128); \
  return res; \
}

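/* FSAD32_H(h) expands to vp9_sad32x<h>_avx2().  A 32-byte row fills a
 * single 256-bit register, so the loop processes two rows per iteration
 * and advances both pointers by twice their stride.  The final
 * horizontal reduction is the same as in FSAD64_H. */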
#define FSAD32_H(h) \
unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
                                  int src_stride, \
                                  const uint8_t *ref_ptr, \
                                  int ref_stride) { \
  int i, res; \
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
  __m256i sum_sad = _mm256_setzero_si256(); \
  __m256i sum_sad_h; \
  __m128i sum_sad128; \
  int ref2_stride = ref_stride << 1; \
  int src2_stride = src_stride << 1; \
  int max = h >> 1; \
  for (i = 0; i < max; i++) { \
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
    sad1_reg = _mm256_sad_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)src_ptr)); \
    sad2_reg = _mm256_sad_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
    ref_ptr += ref2_stride; \
    src_ptr += src2_stride; \
  } \
  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
  res = _mm_cvtsi128_si32(sum_sad128); \
  return res; \
}

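/* Emit the plain SAD kernels for the block sizes VP9 uses at these
 * widths: 64x64, 64x32, 32x64, 32x32 and 32x16. */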
#define FSAD64 \
FSAD64_H(64); \
FSAD64_H(32);

#define FSAD32 \
FSAD32_H(64); \
FSAD32_H(32); \
FSAD32_H(16);

FSAD64;
FSAD32;

#undef FSAD64
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H

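/* The _avg_ variants compute the SAD against the rounded average of
 * ref and second_pred (compound prediction).  As the code shows,
 * second_pred is laid out contiguously and advances by 64 bytes per
 * iteration: one 64-byte row for the 64-wide kernels, or two 32-byte
 * rows back to back for the 32-wide kernels. */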
#define FSADAVG64_H(h) \
unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
                                      int src_stride, \
                                      const uint8_t *ref_ptr, \
                                      int ref_stride, \
                                      const uint8_t *second_pred) { \
  int i, res; \
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
  __m256i sum_sad = _mm256_setzero_si256(); \
  __m256i sum_sad_h; \
  __m128i sum_sad128; \
  for (i = 0; i < h; i++) { \
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
    ref1_reg = _mm256_avg_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)second_pred)); \
    ref2_reg = _mm256_avg_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
    sad1_reg = _mm256_sad_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)src_ptr)); \
    sad2_reg = _mm256_sad_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
    ref_ptr += ref_stride; \
    src_ptr += src_stride; \
    second_pred += 64; \
  } \
  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
  res = _mm_cvtsi128_si32(sum_sad128); \
  return res; \
}

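/* As in FSAD32_H, two 32-byte rows are handled per iteration; the two
 * halves of each 64-byte second_pred step supply the predictor for the
 * first and second row respectively. */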
#define FSADAVG32_H(h) \
unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
                                      int src_stride, \
                                      const uint8_t *ref_ptr, \
                                      int ref_stride, \
                                      const uint8_t *second_pred) { \
  int i, res; \
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
  __m256i sum_sad = _mm256_setzero_si256(); \
  __m256i sum_sad_h; \
  __m128i sum_sad128; \
  int ref2_stride = ref_stride << 1; \
  int src2_stride = src_stride << 1; \
  int max = h >> 1; \
  for (i = 0; i < max; i++) { \
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
    ref1_reg = _mm256_avg_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)second_pred)); \
    ref2_reg = _mm256_avg_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
    sad1_reg = _mm256_sad_epu8(ref1_reg, \
               _mm256_loadu_si256((__m256i const *)src_ptr)); \
    sad2_reg = _mm256_sad_epu8(ref2_reg, \
               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
    sum_sad = _mm256_add_epi32(sum_sad, \
                               _mm256_add_epi32(sad1_reg, sad2_reg)); \
    ref_ptr += ref2_stride; \
    src_ptr += src2_stride; \
    second_pred += 64; \
  } \
  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
  res = _mm_cvtsi128_si32(sum_sad128); \
  return res; \
}

#define FSADAVG64 \
FSADAVG64_H(64); \
FSADAVG64_H(32);

#define FSADAVG32 \
FSADAVG32_H(64); \
FSADAVG32_H(32); \
FSADAVG32_H(16);

FSADAVG64;
FSADAVG32;

#undef FSADAVG64
#undef FSADAVG32
#undef FSADAVG64_H
#undef FSADAVG32_H
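
/* Usage sketch (illustrative only -- in libvpx these kernels are
 * normally reached through the run-time CPU dispatch tables rather
 * than called directly, and the caller must ensure AVX2 is available):
 *
 *   uint8_t src[64 * 64], ref[64 * 64];
 *   // ... fill src and ref ...
 *   unsigned int sad = vp9_sad64x64_avx2(src, 64, ref, 64);
 *
 * All loads are unaligned (_mm256_loadu_si256), so no particular
 * buffer alignment is required. */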