OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 14 matching lines...) |
25 | 25 |
26 unsigned int vp8_sub_pixel_variance16x16_neon_func( | 26 unsigned int vp8_sub_pixel_variance16x16_neon_func( |
27 const unsigned char *src_ptr, | 27 const unsigned char *src_ptr, |
28 int src_pixels_per_line, | 28 int src_pixels_per_line, |
29 int xoffset, | 29 int xoffset, |
30 int yoffset, | 30 int yoffset, |
31 const unsigned char *dst_ptr, | 31 const unsigned char *dst_ptr, |
32 int dst_pixels_per_line, | 32 int dst_pixels_per_line, |
33 unsigned int *sse) { | 33 unsigned int *sse) { |
34 int i; | 34 int i; |
35 DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); | 35 DECLARE_ALIGNED(16, unsigned char, tmp[528]); |
36 unsigned char *tmpp; | 36 unsigned char *tmpp; |
37 unsigned char *tmpp2; | 37 unsigned char *tmpp2; |
38 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; | 38 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; |
39 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; | 39 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; |
40 uint8x8_t d19u8, d20u8, d21u8; | 40 uint8x8_t d19u8, d20u8, d21u8; |
41 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; | 41 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; |
42 uint32x2_t d0u32, d10u32; | 42 uint32x2_t d0u32, d10u32; |
43 int64x1_t d0s64, d1s64, d2s64, d3s64; | 43 int64x1_t d0s64, d1s64, d2s64, d3s64; |
44 uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; | 44 uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; |
45 uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; | 45 uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; |
(...skipping 858 matching lines...) |
904 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), | 904 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), |
905 vreinterpret_s32_s64(d0s64)); | 905 vreinterpret_s32_s64(d0s64)); |
906 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); | 906 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); |
907 | 907 |
908 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); | 908 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); |
909 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); | 909 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); |
910 | 910 |
911 return vget_lane_u32(d0u32, 0); | 911 return vget_lane_u32(d0u32, 0); |
912 } | 912 } |
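The visible change at the top of this function (mirrored in the 8x8 function below) is replacing DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528) with DECLARE_ALIGNED(16, unsigned char, tmp[528]). As a rough sketch of what the replacement macro typically expands to on a GCC-style toolchain (an assumption about the macro's definition elsewhere in libvpx, not something shown in this diff):

    /* Hypothetical expansion on GCC/Clang; the real definition lives in
     * a libvpx ports header and may differ (e.g. MSVC would use
     * __declspec(align(n))). */
    #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

    /* The new declaration then becomes a 16-byte-aligned local buffer: */
    unsigned char tmp[528] __attribute__((aligned(16)));

The tail of the function above stores the summed squared error to *sse and returns *sse - ((sum * sum) >> 8), i.e. SSE minus the squared sum divided by 256, the pixel count of a 16x16 block.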
913 | 913 |
914 enum { kWidth8 = 8 }; | |
915 enum { kHeight8 = 8 }; | |
916 enum { kHeight8PlusOne = 9 }; | |
917 enum { kPixelStepOne = 1 }; | |
918 enum { kAlign16 = 16 }; | |
919 | |
920 #define FILTER_BITS 7 | 914 #define FILTER_BITS 7 |
921 | 915 |
922 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { | 916 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { |
923 const int32x4_t a = vpaddlq_s16(v_16x8); | 917 const int32x4_t a = vpaddlq_s16(v_16x8); |
924 const int64x2_t b = vpaddlq_s32(a); | 918 const int64x2_t b = vpaddlq_s32(a); |
925 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), | 919 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), |
926 vreinterpret_s32_s64(vget_high_s64(b))); | 920 vreinterpret_s32_s64(vget_high_s64(b))); |
927 return vget_lane_s32(c, 0); | 921 return vget_lane_s32(c, 0); |
928 } | 922 } |
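For reference, a plain-C model of what horizontal_add_s16x8 computes (a sketch, not part of the patch): the widening pairwise adds reduce the eight 16-bit lanes to a single 32-bit total without intermediate overflow, which in scalar form is simply:

    /* Scalar sketch: sum eight int16 values into a 32-bit total.
     * The NEON version above does this with widening pairwise adds. */
    static int horizontal_add_s16x8_scalar(const int16_t v[8]) {
      int32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += v[i];
      return (int)sum;
    }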
929 | 923 |
(...skipping 31 matching lines...) |
961 } | 955 } |
962 | 956 |
963 *sum = horizontal_add_s16x8(v_sum); | 957 *sum = horizontal_add_s16x8(v_sum); |
964 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); | 958 *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); |
965 } | 959 } |
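Most of variance_neon_w8 is elided above; based on its visible tail (it writes out a block sum and a sum of squares) and its call sites, a scalar model of the accumulation it performs over a w x h block would be the following sketch. The helper name is hypothetical and the NEON body presumably uses widening subtract and multiply-accumulate intrinsics instead of this loop.

    /* Scalar sketch of the accumulation in variance_neon_w8:
     * sum of (a - b) and sum of (a - b)^2 over a w x h block. */
    static void variance_w_h_scalar(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    int w, int h,
                                    unsigned int *sse, int *sum) {
      int i, j;
      *sum = 0;
      *sse = 0;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          const int diff = a[j] - b[j];
          *sum += diff;
          *sse += (unsigned int)(diff * diff);
        }
        a += a_stride;
        b += b_stride;
      }
    }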
966 | 960 |
967 static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, | 961 static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, |
968 const uint8_t *b, int b_stride, | 962 const uint8_t *b, int b_stride, |
969 unsigned int *sse) { | 963 unsigned int *sse) { |
970 int sum; | 964 int sum; |
971 variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); | 965 variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); |
972 return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); | 966 return *sse - (((int64_t)sum * sum) / (8 * 8)); |
973 } | 967 } |
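variance8x8_neon applies the usual shortcut for the sum of squared deviations: for N pixel differences d_i, sum_i (d_i - mean)^2 = sum_i d_i^2 - (sum_i d_i)^2 / N. With N = 8 * 8 = 64 this gives the returned *sse - ((int64_t)sum * sum) / 64, and the 16x16 path earlier uses the same identity with N = 256, which is where its right shift by 8 comes from.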
974 | 968 |
975 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, | 969 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, |
976 uint8_t *output_ptr, | 970 uint8_t *output_ptr, |
977 unsigned int src_pixels_per_line, | 971 unsigned int src_pixels_per_line, |
978 int pixel_step, | 972 int pixel_step, |
979 unsigned int output_height, | 973 unsigned int output_height, |
980 unsigned int output_width, | 974 unsigned int output_width, |
981 const uint16_t *vpx_filter) { | 975 const uint16_t *vpx_filter) { |
982 const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); | 976 const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); |
(...skipping 13 matching lines...) |
996 } | 990 } |
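The body of var_filter_block2d_bil_w8 is elided above, but from its signature and the FILTER_BITS definition, each output pixel is a two-tap bilinear blend of a pixel and its neighbor pixel_step entries ahead (1 for a horizontal blend, the pitch of the buffer being filtered for a vertical blend). A scalar sketch of the per-pixel step, assuming the usual rounded shift by FILTER_BITS; the function name and the exact rounding are assumptions, and the NEON version processes eight samples at a time:

    /* Scalar sketch of one bilinear output sample. */
    static uint8_t bilinear_tap_scalar(const uint8_t *src, int pixel_step,
                                       const uint16_t *taps) {
      const int acc = src[0] * taps[0] + src[pixel_step] * taps[1];
      return (uint8_t)((acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }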
997 | 991 |
998 unsigned int vp8_sub_pixel_variance8x8_neon( | 992 unsigned int vp8_sub_pixel_variance8x8_neon( |
999 const unsigned char *src, | 993 const unsigned char *src, |
1000 int src_stride, | 994 int src_stride, |
1001 int xoffset, | 995 int xoffset, |
1002 int yoffset, | 996 int yoffset, |
1003 const unsigned char *dst, | 997 const unsigned char *dst, |
1004 int dst_stride, | 998 int dst_stride, |
1005 unsigned int *sse) { | 999 unsigned int *sse) { |
1006 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8PlusOne * kWidth8); | 1000 DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]); |
1007 DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); | 1001 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); |
1008 if (xoffset == 0) { | 1002 if (xoffset == 0) { |
1009 var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8, | 1003 var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8, |
1010 kWidth8, bilinear_taps_coeff[yoffset]); | 1004 8, bilinear_taps_coeff[yoffset]); |
1011 } else if (yoffset == 0) { | 1005 } else if (yoffset == 0) { |
1012 var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne, | 1006 var_filter_block2d_bil_w8(src, temp2, src_stride, 1, |
1013 kHeight8PlusOne, kWidth8, | 1007 9, 8, |
1014 bilinear_taps_coeff[xoffset]); | 1008 bilinear_taps_coeff[xoffset]); |
1015 } else { | 1009 } else { |
1016 var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, | 1010 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, |
1017 kHeight8PlusOne, kWidth8, | 1011 9, 8, |
1018 bilinear_taps_coeff[xoffset]); | 1012 bilinear_taps_coeff[xoffset]); |
1019 var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, | 1013 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, |
1020 kWidth8, bilinear_taps_coeff[yoffset]); | 1014 8, bilinear_taps_coeff[yoffset]); |
1021 } | 1015 } |
1022 return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); | 1016 return variance8x8_neon(temp2, 8, dst, dst_stride, sse); |
1023 } | 1017 } |
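The dispatch in vp8_sub_pixel_variance8x8_neon runs only the pass(es) it needs: with xoffset == 0 a single blend using the yoffset taps, with yoffset == 0 a single horizontal blend using the xoffset taps, and otherwise a horizontal pass into the 9x8 intermediate fdata3 followed by a vertical pass into temp2, before measuring variance against dst. A hypothetical call site (buffer names, strides, and offsets below are made up for illustration):

    /* Hypothetical usage; src_buf/ref_buf and their strides are not from
     * this diff. xoffset/yoffset select entries in bilinear_taps_coeff. */
    unsigned int sse;
    unsigned int var = vp8_sub_pixel_variance8x8_neon(src_buf, src_stride,
                                                      2, 5, /* x, y offsets */
                                                      ref_buf, ref_stride,
                                                      &sse);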