Index: source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c (revision 291857)
+++ source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c (working copy)
@@ -12,67 +12,39 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
-typedef void (*get_var_avx2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
-void vp9_get16x16var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
-void vp9_get32x32var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  int height,
-  unsigned int *sse
-);
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  const uint8_t *sec,
-  int sec_stride,
-  int height,
-  unsigned int *sseptr
-);
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
-static void variance_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          int w, int h, unsigned int *sse, int *sum,
-                          get_var_avx2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
   int i, j;
   *sse = 0;
@@ -80,105 +52,68 @@
   for (i = 0; i < h; i += 16) {
     for (j = 0; j < w; j += block_size) {
-      // processing 16 rows horizontally each call
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
       *sse += sse0;
       *sum += sum0;
     }
   }
 }
-unsigned int vp9_variance16x16_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_avx2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vp9_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
-unsigned int vp9_mse16x16_avx2(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
 }
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
 }
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
 }
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
@@ -187,22 +122,19 @@
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           64, &sse);
-  // processing the next 32 elements in parallel
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
   unsigned int sse2;
-  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                            x_offset, y_offset,
-                                            dst + 32, dst_stride,
-                                            64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
+  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
 }
 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -211,14 +143,11 @@
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           32, &sse);
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+                                              unsigned int *sse) {
+  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
@@ -227,24 +156,22 @@
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 64, 64, &sse);
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
   unsigned int sse2;
-  // processing the next 32 elements in parallel
-  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                                y_offset, dst + 32, dst_stride,
-                                                sec + 32, 64, 64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
+  const int se2 =
+      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                          y_offset, dst + 32, dst_stride,
+                                          sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
 }
 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
@@ -253,15 +180,11 @@
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
   // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 32, 32, &sse);
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
-
-
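
Note (not part of the patch): every vp9_varianceWxH_avx2 return above computes
the usual block variance, sse - sum^2 / (w * h), with the division written as a
right shift because the pixel count is a power of two (16x16 -> >> 8,
32x16 -> >> 9, 32x32 -> >> 10, 64x32 -> >> 11, 64x64 -> >> 12). A minimal
standalone C sketch of that formula follows; the helper name block_variance is
illustrative only and is not a libvpx symbol.

#include <stdint.h>

/* Variance of a w x h block from the accumulated sum of squared
 * differences (sse) and sum of differences (sum), where
 * shift == log2(w * h). */
static unsigned int block_variance(unsigned int sse, int sum, int shift) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
}

For example, vp9_variance64x64_avx2 accumulates sse and sum over eight 32-wide,
16-row strips via variance_avx2/vp9_get32x32var_avx2 and then evaluates the
equivalent of block_variance(sse, sum, 12).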