/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
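
/* Reduce two uint16x8_t partial-SAD accumulators to a single scalar, widening
 * the lanes to 32 and then 64 bits along the way so the combined total cannot
 * overflow 16 bits. */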
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
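
/* Reduce a single uint16x8_t accumulator to a scalar via pairwise widening
 * adds. */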
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
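
/* SAD of a 64x64 block: each row is loaded as four 16-byte chunks and the
 * absolute differences are accumulated into two uint16x8_t vectors with
 * vabal_u8. Each 16-bit lane sees at most 64 * 4 * 255 = 65280, so the lanes
 * cannot overflow, but the final total can exceed 16 bits, hence the widening
 * horizontal reduction. */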
unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
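
/* SAD of a 32x32 block: two 16-byte chunks per row. Each lane accumulates at
 * most 32 * 2 * 255 = 16320, so the two accumulators can be added together in
 * 16 bits before the horizontal reduction. */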
unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
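
/* SAD of a 16x16 block: one 16-byte load per row for each of src and ref,
 * split into low and high halves for the widening absolute-difference
 * accumulate. */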
unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
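
/* SAD of an 8x8 block: a single 8-byte load per row for each buffer,
 * accumulated into one uint16x8_t vector. */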
unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}