| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include "./vp9_rtcd.h" | 12 #include "./vp9_rtcd.h" |
| 13 #include "./vpx_dsp_rtcd.h" | 13 #include "./vpx_dsp_rtcd.h" |
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 15 | 15 |
| 16 #include "vpx_ports/mem.h" | 16 #include "vpx_ports/mem.h" |
| 17 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
| 18 | 18 |
| 19 #include "vp9/common/vp9_common.h" | |
| 20 #include "vp9/common/vp9_filter.h" | 19 #include "vp9/common/vp9_filter.h" |
| 21 | 20 |
// Two-tap bilinear kernels, selected by the xoffset / yoffset argument of the
// sub-pixel variance functions in this file. Each row holds the weights for
// {pixel, pixel + pixel_step}; the two weights in every row sum to 128, so the
// weighted sum of two 8-bit pixels always fits in 16 bits.
// NOTE(review): 128 presumably equals 1 << FILTER_BITS -- confirm against
// vp9_filter.h.
// The table is read-only, hence const.
static const uint8_t bilinear_filters[8][2] = {
  { 128,   0 },
  { 112,  16 },
  {  96,  32 },
  {  80,  48 },
  {  64,  64 },
  {  48,  80 },
  {  32,  96 },
  {  16, 112 },
};
| 23 | 31 |
// Applies a two-tap filter to an 8-pixel-wide block, one row per iteration.
//
//   src_ptr              first input row
//   output_ptr           first output row
//   src_pixels_per_line  distance between consecutive input rows
//   pixel_step           distance between the two taps (callers in this file
//                        pass 1 for a horizontal pass and the row stride for
//                        a vertical pass)
//   output_height        number of rows to produce
//   output_width         distance between consecutive output rows; exactly
//                        8 pixels are written per row regardless
//   vp9_filter           the two tap weights
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint8_t *vp9_filter) {
  // Broadcast each weight across all eight lanes once, outside the loop.
  const uint8x8_t tap0 = vmov_n_u8(vp9_filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(vp9_filter[1]);
  const uint8_t *row_in = src_ptr;
  uint8_t *row_out = output_ptr;
  unsigned int row;

  for (row = 0; row != output_height;
       ++row, row_in += src_pixels_per_line, row_out += output_width) {
    const uint8x8_t near_px = vld1_u8(row_in);
    const uint8x8_t far_px = vld1_u8(row_in + pixel_step);
    // Widening multiply-accumulate: near_px * tap0 + far_px * tap1.
    const uint16x8_t weighted =
        vmlal_u8(vmull_u8(near_px, tap0), far_px, tap1);
    // Rounding right shift by FILTER_BITS, narrowing back to 8 bits.
    vst1_u8(row_out, vrshrn_n_u16(weighted, FILTER_BITS));
  }
}
| 46 | 54 |
| 47 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, | 55 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, |
| 48 uint8_t *output_ptr, | 56 uint8_t *output_ptr, |
| 49 unsigned int src_pixels_per_line, | 57 unsigned int src_pixels_per_line, |
| 50 int pixel_step, | 58 int pixel_step, |
| 51 unsigned int output_height, | 59 unsigned int output_height, |
| 52 unsigned int output_width, | 60 unsigned int output_width, |
| 53 const int16_t *vp9_filter) { | 61 const uint8_t *vp9_filter) { |
| 54 const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); | 62 const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); |
| 55 const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); | 63 const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); |
| 56 unsigned int i, j; | 64 unsigned int i, j; |
| 57 for (i = 0; i < output_height; ++i) { | 65 for (i = 0; i < output_height; ++i) { |
| 58 for (j = 0; j < output_width; j += 16) { | 66 for (j = 0; j < output_width; j += 16) { |
| 59 const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); | 67 const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); |
| 60 const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); | 68 const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); |
| 61 const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); | 69 const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); |
| 62 const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); | 70 const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); |
| 63 const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); | 71 const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); |
| 64 const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); | 72 const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); |
| 65 const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); | 73 const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); |
| (...skipping 11 matching lines...) Expand all Loading... |
| 77 int xoffset, | 85 int xoffset, |
| 78 int yoffset, | 86 int yoffset, |
| 79 const uint8_t *dst, | 87 const uint8_t *dst, |
| 80 int dst_stride, | 88 int dst_stride, |
| 81 unsigned int *sse) { | 89 unsigned int *sse) { |
| 82 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); | 90 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); |
| 83 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); | 91 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); |
| 84 | 92 |
| 85 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, | 93 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, |
| 86 9, 8, | 94 9, 8, |
| 87 BILINEAR_FILTERS_2TAP(xoffset)); | 95 bilinear_filters[xoffset]); |
| 88 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, | 96 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, |
| 89 8, BILINEAR_FILTERS_2TAP(yoffset)); | 97 8, bilinear_filters[yoffset]); |
| 90 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); | 98 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); |
| 91 } | 99 } |
| 92 | 100 |
| 93 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, | 101 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, |
| 94 int src_stride, | 102 int src_stride, |
| 95 int xoffset, | 103 int xoffset, |
| 96 int yoffset, | 104 int yoffset, |
| 97 const uint8_t *dst, | 105 const uint8_t *dst, |
| 98 int dst_stride, | 106 int dst_stride, |
| 99 unsigned int *sse) { | 107 unsigned int *sse) { |
| 100 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); | 108 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); |
| 101 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); | 109 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); |
| 102 | 110 |
| 103 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 111 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
| 104 17, 16, | 112 17, 16, |
| 105 BILINEAR_FILTERS_2TAP(xoffset)); | 113 bilinear_filters[xoffset]); |
| 106 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, | 114 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, |
| 107 16, BILINEAR_FILTERS_2TAP(yoffset)); | 115 16, bilinear_filters[yoffset]); |
| 108 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); | 116 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); |
| 109 } | 117 } |
| 110 | 118 |
| 111 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, | 119 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, |
| 112 int src_stride, | 120 int src_stride, |
| 113 int xoffset, | 121 int xoffset, |
| 114 int yoffset, | 122 int yoffset, |
| 115 const uint8_t *dst, | 123 const uint8_t *dst, |
| 116 int dst_stride, | 124 int dst_stride, |
| 117 unsigned int *sse) { | 125 unsigned int *sse) { |
| 118 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); | 126 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); |
| 119 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); | 127 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); |
| 120 | 128 |
| 121 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 129 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
| 122 33, 32, | 130 33, 32, |
| 123 BILINEAR_FILTERS_2TAP(xoffset)); | 131 bilinear_filters[xoffset]); |
| 124 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, | 132 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, |
| 125 32, BILINEAR_FILTERS_2TAP(yoffset)); | 133 32, bilinear_filters[yoffset]); |
| 126 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); | 134 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); |
| 127 } | 135 } |
| 128 | 136 |
| 129 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, | 137 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, |
| 130 int src_stride, | 138 int src_stride, |
| 131 int xoffset, | 139 int xoffset, |
| 132 int yoffset, | 140 int yoffset, |
| 133 const uint8_t *dst, | 141 const uint8_t *dst, |
| 134 int dst_stride, | 142 int dst_stride, |
| 135 unsigned int *sse) { | 143 unsigned int *sse) { |
| 136 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); | 144 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); |
| 137 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); | 145 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); |
| 138 | 146 |
| 139 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, | 147 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, |
| 140 65, 64, | 148 65, 64, |
| 141 BILINEAR_FILTERS_2TAP(xoffset)); | 149 bilinear_filters[xoffset]); |
| 142 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, | 150 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, |
| 143 64, BILINEAR_FILTERS_2TAP(yoffset)); | 151 64, bilinear_filters[yoffset]); |
| 144 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); | 152 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); |
| 145 } | 153 } |
| OLD | NEW |