OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 | 12 |
/*
 * Review note: table of 8 rows, each holding two filter weights. The visible
 * call sites pick a row with xoffset and broadcast each weight into a vector
 * with vdup_n_u8(). Every row sums to 128 and no entry exceeds 128, so all
 * values fit in the uint8_t element type used in the NEW column; with that
 * type the call sites pass the entries to vdup_n_u8() without the (uint8_t)
 * cast that appears in the OLD column.
 */
13 static const uint16_t bifilter4_coeff[8][2] = { | 13 static const uint8_t bifilter4_coeff[8][2] = { |
14 {128, 0}, | 14 {128, 0}, |
15 {112, 16}, | 15 {112, 16}, |
16 { 96, 32}, | 16 { 96, 32}, |
17 { 80, 48}, | 17 { 80, 48}, |
18 { 64, 64}, | 18 { 64, 64}, |
19 { 48, 80}, | 19 { 48, 80}, |
20 { 32, 96}, | 20 { 32, 96}, |
21 { 16, 112} | 21 { 16, 112} |
22 }; | 22 }; |
23 | 23 |
(...skipping 33 matching lines...)
57 } else { | 57 } else { |
58 d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 58 d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
59 d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 59 d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
60 d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 60 d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
61 d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 61 d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
62 d6u8 = vld1_u8(src_ptr); | 62 d6u8 = vld1_u8(src_ptr); |
63 | 63 |
64 q1u8 = vcombine_u8(d2u8, d3u8); | 64 q1u8 = vcombine_u8(d2u8, d3u8); |
65 q2u8 = vcombine_u8(d4u8, d5u8); | 65 q2u8 = vcombine_u8(d4u8, d5u8); |
66 | 66 |
67 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 67 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
68 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 68 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
69 | 69 |
70 q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); | 70 q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); |
71 q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); | 71 q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); |
72 d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); | 72 d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); |
73 | 73 |
74 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), | 74 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), |
75 vreinterpret_u32_u8(vget_high_u8(q1u8))); | 75 vreinterpret_u32_u8(vget_high_u8(q1u8))); |
76 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), | 76 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), |
77 vreinterpret_u32_u8(vget_high_u8(q2u8))); | 77 vreinterpret_u32_u8(vget_high_u8(q2u8))); |
78 d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), | 78 d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), |
(...skipping 69 matching lines...)
148 d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 148 d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
149 d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 149 d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
150 d26u8 = vld1_u8(src_ptr); | 150 d26u8 = vld1_u8(src_ptr); |
151 } else { | 151 } else { |
152 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 152 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
153 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 153 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
154 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 154 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
155 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 155 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
156 q5u8 = vld1q_u8(src_ptr); | 156 q5u8 = vld1q_u8(src_ptr); |
157 | 157 |
158 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 158 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
159 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 159 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
160 | 160 |
161 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); | 161 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); |
162 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); | 162 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); |
163 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); | 163 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); |
164 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); | 164 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); |
165 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); | 165 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); |
166 | 166 |
167 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); | 167 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); |
168 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); | 168 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); |
169 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); | 169 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); |
(...skipping 68 matching lines...)
238 d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 238 d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
239 d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 239 d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
240 d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 240 d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
241 d30u8 = vld1_u8(src_ptr); | 241 d30u8 = vld1_u8(src_ptr); |
242 } else { | 242 } else { |
243 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 243 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
244 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 244 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
245 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 245 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
246 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 246 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
247 | 247 |
248 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 248 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
249 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 249 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
250 | 250 |
251 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); | 251 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); |
252 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); | 252 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); |
253 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); | 253 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); |
254 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); | 254 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); |
255 | 255 |
256 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); | 256 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); |
257 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); | 257 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); |
258 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); | 258 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); |
259 d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); | 259 d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); |
(...skipping 430 matching lines...)
690 | 690 |
691 q11u8 = q15u8; | 691 q11u8 = q15u8; |
692 | 692 |
693 vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; | 693 vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; |
694 vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; | 694 vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; |
695 vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; | 695 vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; |
696 vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; | 696 vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; |
697 } | 697 } |
698 return; | 698 return; |
699 } | 699 } |
OLD | NEW |