Chromium Code Reviews

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c

Issue 554673004: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
--- a/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
 /*
  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef void (*get_var_avx2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
 
-void vp9_get16x16var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
 
-void vp9_get32x32var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
 
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  int height,
-  unsigned int *sse
-);
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
 
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  const uint8_t *sec,
-  int sec_stride,
-  int height,
-  unsigned int *sseptr
-);
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
 
-static void variance_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
                           int w, int h, unsigned int *sse, int *sum,
                           get_var_avx2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
   int i, j;
 
   *sse = 0;
   *sum = 0;
 
   for (i = 0; i < h; i += 16) {
     for (j = 0; j < w; j += block_size) {
-      // processing 16 rows horizontally each call
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
       *sse += sse0;
       *sum += sum0;
     }
   }
 }
 
-unsigned int vp9_variance16x16_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_avx2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vp9_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
-unsigned int vp9_mse16x16_avx2(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
 }
 
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
 }
 
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 
 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                               int src_stride,
                                               int x_offset,
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           64, &sse);
-  // processing the next 32 elements in parallel
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
   unsigned int sse2;
-  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                            x_offset, y_offset,
-                                            dst + 32, dst_stride,
-                                            64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
+  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                               int src_stride,
                                               int x_offset,
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           32, &sse);
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+                                              unsigned int *sse) {
+  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
 
 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                   int src_stride,
                                                   int x_offset,
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 64, 64, &sse);
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
   unsigned int sse2;
-  // processing the next 32 elements in parallel
-  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                                y_offset, dst + 32, dst_stride,
-                                                sec + 32, 64, 64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sseptr = sse;
+  const int se2 =
+      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                          y_offset, dst + 32, dst_stride,
+                                          sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
 
-  return sse - (((int64_t)se * se) >> 12);
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                   int src_stride,
                                                   int x_offset,
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
   // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 32, 32, &sse);
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
}
-
-
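
A note on the arithmetic these functions share: each one accumulates the sum of pixel
differences (sum) and the sum of squared differences (SSE) over the block, then applies
the identity variance = SSE - sum^2 / N. Every block size here has a power-of-two pixel
count, so the division is the right shift in each return statement (16x16 -> >> 8,
32x16 -> >> 9, 32x32 -> >> 10, 64x32 -> >> 11, 64x64 -> >> 12). A minimal scalar sketch
of the same computation, for orientation only (the helper name and loop below are
illustrative, not part of this patch or of libvpx):

#include <stdint.h>

/* Scalar model of the identity used throughout vp9_variance_avx2.c:
 * variance = SSE - sum^2 / N. Hypothetical reference code, not one of
 * the AVX2 kernels. */
static unsigned int variance_ref_c(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  unsigned int sse_acc = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;             /* signed error */
      sse_acc += diff * diff;  /* at most 255^2 * 4096 for 64x64: fits in 32 bits */
    }
  }
  *sse = sse_acc;
  /* sum^2 needs 64 bits for blocks larger than 16x16 (|sum| can reach
   * 255 * 4096 = 1044480 for 64x64), which is why the patch casts to
   * (int64_t) there; the 16x16 case fits in unsigned int because
   * (255 * 256)^2 < 2^32. w * h is a power of two, so this division is
   * the shift seen in the patch. */
  return sse_acc - (unsigned int)((sum * sum) / (w * h));
}

The same two accumulators also explain vp9_mse16x16_avx2: mean squared error keeps only
the SSE term, with no mean correction, so it simply returns *sse.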
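The 64-pixel-wide sub-pixel paths call their kernel twice because
vp9_sub_pixel_variance32xh_avx2 and vp9_sub_pixel_avg_variance32xh_avx2 each cover 32
columns per row: one call handles columns 0..31, a second handles columns 32..63
(src + 32, dst + 32, and sec + 32). The partial sums and SSEs add linearly, but sum^2
does not distribute over the halves, so the patch combines the raw accumulators first
and applies a single correction over the whole block. A sketch of that combination step
(hypothetical helper name, assuming se1/sse1 and se2/sse2 were produced as in the patch):

#include <stdint.h>

/* Hypothetical illustration of the merge in
 * vp9_sub_pixel_variance64x64_avx2: add the raw accumulators, then do
 * one variance correction over the full 64x64 = 4096-pixel block.
 * Correcting each 32x64 half separately (>> 11 twice) would not give
 * the same result, since (a + b)^2 != a^2 + b^2 in general. */
static unsigned int combine_64x64_halves(int se1, unsigned int sse1,
                                         int se2, unsigned int sse2,
                                         unsigned int *sse) {
  const int se = se1 + se2;  /* total signed error over both halves */
  *sse = sse1 + sse2;        /* total sum of squared errors */
  return *sse - (unsigned int)(((int64_t)se * se) >> 12);
}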
