OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include <assert.h> |
| 13 |
12 #include "./vp9_rtcd.h" | 14 #include "./vp9_rtcd.h" |
13 #include "./vpx_config.h" | 15 #include "./vpx_config.h" |
14 | 16 |
15 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
16 | 18 |
17 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { | 19 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { |
18 const uint32x4_t a = vpaddlq_u16(v_16x8); | 20 const uint32x4_t a = vpaddlq_u16(v_16x8); |
19 const uint64x2_t b = vpaddlq_u32(a); | 21 const uint64x2_t b = vpaddlq_u32(a); |
20 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), | 22 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), |
21 vreinterpret_u32_u64(vget_high_u64(b))); | 23 vreinterpret_u32_u64(vget_high_u64(b))); |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
107 | 109 |
108 for (i = 0; i < width; i += 16) { | 110 for (i = 0; i < width; i += 16) { |
109 const uint8x16_t vec_row = vld1q_u8(ref); | 111 const uint8x16_t vec_row = vld1q_u8(ref); |
110 vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); | 112 vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); |
111 vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); | 113 vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); |
112 ref += 16; | 114 ref += 16; |
113 } | 115 } |
114 | 116 |
115 return horizontal_add_u16x8(vec_sum); | 117 return horizontal_add_u16x8(vec_sum); |
116 } | 118 } |
| 119 |
| 120 // Computes sum(d * d) - ((sum(d) * sum(d)) >> (bwl + 2)) for d = ref[i] - src[i] over 'width' elements. ref, src = [0, 510] - max diff = 16-bits |
| 121 // bwl = {2, 3, 4}, width = 4 << bwl = {16, 32, 64}; width must be a non-zero multiple of 8 (asserted below). |
| 122 int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { |
| 123 int width = 4 << bwl; |
| 124 int32x4_t sse = vdupq_n_s32(0); |
| 125 int16x8_t total = vdupq_n_s16(0); |
| 126 |
| 127 assert(width >= 8); |
| 128 assert((width % 8) == 0); |
| 129 |
| 130 do { |
| 131 const int16x8_t r = vld1q_s16(ref); |
| 132 const int16x8_t s = vld1q_s16(src); |
| 133 const int16x8_t diff = vsubq_s16(r, s); // Per-lane ref - src: [-510, 510], 10 bits. |
| 134 const int16x4_t diff_lo = vget_low_s16(diff); |
| 135 const int16x4_t diff_hi = vget_high_s16(diff); |
| 136 sse = vmlal_s16(sse, diff_lo, diff_lo); // Widening multiply-accumulate of squared diffs; dynamic range 26 bits. |
| 137 sse = vmlal_s16(sse, diff_hi, diff_hi); |
| 138 total = vaddq_s16(total, diff); // Per-lane running sum of diffs; dynamic range 16 bits. |
| 139 |
| 140 ref += 8; |
| 141 src += 8; |
| 142 width -= 8; |
| 143 } while (width != 0); |
| 144 |
| 145 { |
| 146 // Note: the pairwise addition of 'total' could be implemented similarly to |
| 147 // horizontal_add_u16x8(), but using one less vpaddl on 'total' when paired |
| 148 // with the summation of 'sse' performed better on a Cortex-A15. |
| 149 const int32x4_t t0 = vpaddlq_s16(total); // Horizontal sum of 'total': widen adjacent pairs to 32 bits first. |
| 150 const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); |
| 151 const int32x2_t t2 = vpadd_s32(t1, t1); |
| 152 const int t = vget_lane_s32(t2, 0); |
| 153 const int64x2_t s0 = vpaddlq_s32(sse); // Horizontal sum of 'sse': widen adjacent pairs to 64 bits first. |
| 154 const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), |
| 155 vreinterpret_s32_s64(vget_high_s64(s0))); |
| 156 const int s = vget_lane_s32(s1, 0); |
| 157 const int shift_factor = bwl + 2; |
| 158 return s - ((t * t) >> shift_factor); |
| 159 } |
| 160 } |
OLD | NEW |