OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 14 matching lines...) |
25 | 25 |
26 unsigned int vp8_sub_pixel_variance16x16_neon_func( | 26 unsigned int vp8_sub_pixel_variance16x16_neon_func( |
27 const unsigned char *src_ptr, | 27 const unsigned char *src_ptr, |
28 int src_pixels_per_line, | 28 int src_pixels_per_line, |
29 int xoffset, | 29 int xoffset, |
30 int yoffset, | 30 int yoffset, |
31 const unsigned char *dst_ptr, | 31 const unsigned char *dst_ptr, |
32 int dst_pixels_per_line, | 32 int dst_pixels_per_line, |
33 unsigned int *sse) { | 33 unsigned int *sse) { |
34 int i; | 34 int i; |
35 DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); | 35 DECLARE_ALIGNED(16, unsigned char, tmp[528]); |
36 unsigned char *tmpp; | 36 unsigned char *tmpp; |
37 unsigned char *tmpp2; | 37 unsigned char *tmpp2; |
38 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; | 38 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; |
39 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; | 39 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; |
40 uint8x8_t d19u8, d20u8, d21u8; | 40 uint8x8_t d19u8, d20u8, d21u8; |
41 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; | 41 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; |
42 uint32x2_t d0u32, d10u32; | 42 uint32x2_t d0u32, d10u32; |
43 int64x1_t d0s64, d1s64, d2s64, d3s64; | 43 int64x1_t d0s64, d1s64, d2s64, d3s64; |
44 uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; | 44 uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; |
45 uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; | 45 uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; |
(...skipping 858 matching lines...) |
904 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), | 904 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), |
905 vreinterpret_s32_s64(d0s64)); | 905 vreinterpret_s32_s64(d0s64)); |
906 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); | 906 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); |
907 | 907 |
908 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); | 908 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); |
909 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); | 909 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); |
910 | 910 |
911 return vget_lane_u32(d0u32, 0); | 911 return vget_lane_u32(d0u32, 0); |
912 } | 912 } |
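The visible change at the top of this function (mirrored in the 8x8 function below) is replacing DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528) with DECLARE_ALIGNED(16, unsigned char, tmp[528]). As a rough sketch of what the replacement macro typically expands to on a GCC-style toolchain (an assumption about the macro's definition elsewhere in libvpx, not something shown in this diff):

    /* Hypothetical expansion on GCC/Clang; the real definition lives in
     * a libvpx ports header and may differ (e.g. MSVC would use
     * __declspec(align(n))). */
    #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

    /* The new declaration then becomes a 16-byte-aligned local buffer: */
    unsigned char tmp[528] __attribute__((aligned(16)));

The tail of the function above stores the summed squared error to *sse and returns *sse - ((sum * sum) >> 8), i.e. SSE minus the squared sum divided by 256, the pixel count of a 16x16 block.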
913 | 913 |
914 enum { kWidth8 = 8 }; | |
915 enum { kHeight8 = 8 }; | |
916 enum { kHeight8PlusOne = 9 }; | |
917 enum { kPixelStepOne = 1 }; | |
918 enum { kAlign16 = 16 }; | |
919 | |
920 #define FILTER_BITS 7 | 914 #define FILTER_BITS 7 |
921 | 915 |
922 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { | 916 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { |
923 const int32x4_t a = vpaddlq_s16(v_16x8); | 917 const int32x4_t a = vpaddlq_s16(v_16x8); |
924 const int64x2_t b = vpaddlq_s32(a); | 918 const int64x2_t b = vpaddlq_s32(a); |
925 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | 919 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), |
926 vreinterpret_s32_s64(vget_high_s64(b))); | 920 vreinterpret_s32_s64(vget_high_s64(b))); |
927 return vget_lane_s32(c, 0); | 921 return vget_lane_s32(c, 0); |
928 } | 922 } |
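For reference, a plain-C model of what horizontal_add_s16x8 computes (a sketch, not part of the patch): the widening pairwise adds reduce the eight 16-bit lanes to a single 32-bit total without intermediate overflow, which in scalar form is simply:

    /* Scalar sketch: sum eight int16 values into a 32-bit total.
     * The NEON version above does this with widening pairwise adds. */
    static int horizontal_add_s16x8_scalar(const int16_t v[8]) {
      int32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += v[i];
      return (int)sum;
    }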
929 | 923 |
(...skipping 31 matching lines...) |
961 } | 955 } |
962 | 956 |
963 *sum = horizontal_add_s16x8(v_sum); | 957 *sum = horizontal_add_s16x8(v_sum); |
964 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); | 958 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); |
965 } | 959 } |
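Most of variance_neon_w8 is elided above; based on its visible tail (it writes out a block sum and a sum of squares) and its call sites, a scalar model of the accumulation it performs over a w x h block would be the following sketch. The helper name is hypothetical and the NEON body presumably uses widening subtract and multiply-accumulate intrinsics instead of this loop.

    /* Scalar sketch of the accumulation in variance_neon_w8:
     * sum of (a - b) and sum of (a - b)^2 over a w x h block. */
    static void variance_w_h_scalar(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    int w, int h,
                                    unsigned int *sse, int *sum) {
      int i, j;
      *sum = 0;
      *sse = 0;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          const int diff = a[j] - b[j];
          *sum += diff;
          *sse += (unsigned int)(diff * diff);
        }
        a += a_stride;
        b += b_stride;
      }
    }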
966 | 960 |
967 static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, | 961 static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, |
968 const uint8_t *b, int b_stride, | 962 const uint8_t *b, int b_stride, |
969 unsigned int *sse) { | 963 unsigned int *sse) { |
970 int sum; | 964 int sum; |
971 variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); | 965 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); |
972 return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); | 966 return *sse - (((int64_t)sum * sum) / (8 * 8)); |
973 } | 967 } |
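variance8x8_neon applies the usual shortcut for the sum of squared deviations: for N pixel differences d_i, sum_i (d_i - mean)^2 = sum_i d_i^2 - (sum_i d_i)^2 / N. With N = 8 * 8 = 64 this gives the returned *sse - ((int64_t)sum * sum) / 64, and the 16x16 path earlier uses the same identity with N = 256, which is where its right shift by 8 comes from.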
974 | 968 |
975 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, | 969 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, |
976 uint8_t *output_ptr, | 970 uint8_t *output_ptr, |
977 unsigned int src_pixels_per_line, | 971 unsigned int src_pixels_per_line, |
978 int pixel_step, | 972 int pixel_step, |
979 unsigned int output_height, | 973 unsigned int output_height, |
980 unsigned int output_width, | 974 unsigned int output_width, |
981 const uint16_t *vpx_filter) { | 975 const uint16_t *vpx_filter) { |
982 const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); | 976 const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); |
(...skipping 13 matching lines...) |
996 } | 990 } |
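The body of var_filter_block2d_bil_w8 is elided above, but from its signature and the FILTER_BITS definition, each output pixel is a two-tap bilinear blend of a pixel and its neighbor pixel_step entries ahead (1 for a horizontal blend, the pitch of the buffer being filtered for a vertical blend). A scalar sketch of the per-pixel step, assuming the usual rounded shift by FILTER_BITS; the function name and the exact rounding are assumptions, and the NEON version processes eight samples at a time:

    /* Scalar sketch of one bilinear output sample. */
    static uint8_t bilinear_tap_scalar(const uint8_t *src, int pixel_step,
                                       const uint16_t *taps) {
      const int acc = src[0] * taps[0] + src[pixel_step] * taps[1];
      return (uint8_t)((acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }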
997 | 991 |
998 unsigned int vp8_sub_pixel_variance8x8_neon( | 992 unsigned int vp8_sub_pixel_variance8x8_neon( |
999 const unsigned char *src, | 993 const unsigned char *src, |
1000 int src_stride, | 994 int src_stride, |
1001 int xoffset, | 995 int xoffset, |
1002 int yoffset, | 996 int yoffset, |
1003 const unsigned char *dst, | 997 const unsigned char *dst, |
1004 int dst_stride, | 998 int dst_stride, |
1005 unsigned int *sse) { | 999 unsigned int *sse) { |
1006 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8PlusOne * kWidth8); | 1000 DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]); |
1007 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); | 1001 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); |
1008 if (xoffset == 0) { | 1002 if (xoffset == 0) { |
1009 var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8, | 1003 var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8, |
1010 kWidth8, bilinear_taps_coeff[yoffset]); | 1004 8, bilinear_taps_coeff[yoffset]); |
1011 } else if (yoffset == 0) { | 1005 } else if (yoffset == 0) { |
1012 var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne, | 1006 var_filter_block2d_bil_w8(src, temp2, src_stride, 1, |
1013 kHeight8PlusOne, kWidth8, | 1007 9, 8, |
1014 bilinear_taps_coeff[xoffset]); | 1008 bilinear_taps_coeff[xoffset]); |
1015 } else { | 1009 } else { |
1016 var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, | 1010 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, |
1017 kHeight8PlusOne, kWidth8, | 1011 9, 8, |
1018 bilinear_taps_coeff[xoffset]); | 1012 bilinear_taps_coeff[xoffset]); |
1019 var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, | 1013 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, |
1020 kWidth8, bilinear_taps_coeff[yoffset]); | 1014 8, bilinear_taps_coeff[yoffset]); |
1021 } | 1015 } |
1022 return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); | 1016 return variance8x8_neon(temp2, 8, dst, dst_stride, sse); |
1023 } | 1017 } |
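The dispatch in vp8_sub_pixel_variance8x8_neon runs only the pass(es) it needs: with xoffset == 0 a single blend using the yoffset taps, with yoffset == 0 a single horizontal blend using the xoffset taps, and otherwise a horizontal pass into the 9x8 intermediate fdata3 followed by a vertical pass into temp2, before measuring variance against dst. A hypothetical call site (buffer names, strides, and offsets below are made up for illustration):

    /* Hypothetical usage; src_buf/ref_buf and their strides are not from
     * this diff. xoffset/yoffset select entries in bilinear_taps_coeff. */
    unsigned int sse;
    unsigned int var = vp8_sub_pixel_variance8x8_neon(src_buf, src_stride,
                                                      2, 5, /* x, y offsets */
                                                      ref_buf, ref_stride,
                                                      &sse);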