OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include <assert.h> |
12 | 13 |
13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
14 #include "./vpx_dsp_rtcd.h" | 15 #include "./vpx_dsp_rtcd.h" |
15 #include "vpx/vpx_integer.h" | 16 #include "vpx/vpx_integer.h" |
16 #include "vpx_ports/mem.h" | 17 #include "vpx_ports/mem.h" |
17 | 18 |
18 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |
19 uint8_t *dst, ptrdiff_t dst_stride, | |
20 const int16_t *filter_x, int x_step_q4, | |
21 const int16_t *filter_y, int y_step_q4, | |
22 int w, int h); | |
23 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |
24 uint8_t *dst, ptrdiff_t dst_stride, | |
25 const int16_t *filter_x, int x_step_q4, | |
26 const int16_t *filter_y, int y_step_q4, | |
27 int w, int h); | |
28 | |
29 static INLINE int32x4_t MULTIPLY_BY_Q0( | 19 static INLINE int32x4_t MULTIPLY_BY_Q0( |
30 int16x4_t dsrc0, | 20 int16x4_t dsrc0, |
31 int16x4_t dsrc1, | 21 int16x4_t dsrc1, |
32 int16x4_t dsrc2, | 22 int16x4_t dsrc2, |
33 int16x4_t dsrc3, | 23 int16x4_t dsrc3, |
34 int16x4_t dsrc4, | 24 int16x4_t dsrc4, |
35 int16x4_t dsrc5, | 25 int16x4_t dsrc5, |
36 int16x4_t dsrc6, | 26 int16x4_t dsrc6, |
37 int16x4_t dsrc7, | 27 int16x4_t dsrc7, |
38 int16x8_t q0s16) { | 28 int16x8_t q0s16) { |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
75 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; | 65 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; |
76 int16x8_t q0s16; | 66 int16x8_t q0s16; |
77 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 67 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
78 int32x4_t q1s32, q2s32, q14s32, q15s32; | 68 int32x4_t q1s32, q2s32, q14s32, q15s32; |
79 uint16x8x2_t q0x2u16; | 69 uint16x8x2_t q0x2u16; |
80 uint8x8x2_t d0x2u8, d1x2u8; | 70 uint8x8x2_t d0x2u8, d1x2u8; |
81 uint32x2x2_t d0x2u32; | 71 uint32x2x2_t d0x2u32; |
82 uint16x4x2_t d0x2u16, d1x2u16; | 72 uint16x4x2_t d0x2u16, d1x2u16; |
83 uint32x4x2_t q0x2u32; | 73 uint32x4x2_t q0x2u32; |
84 | 74 |
85 if (x_step_q4 != 16) { | 75 assert(x_step_q4 == 16); |
86 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | |
87 filter_x, x_step_q4, | |
88 filter_y, y_step_q4, w, h); | |
89 return; | |
90 } | |
91 | 76 |
92 q0s16 = vld1q_s16(filter_x); | 77 q0s16 = vld1q_s16(filter_x); |
93 | 78 |
94 src -= 3; // adjust for taps | 79 src -= 3; // adjust for taps |
95 for (; h > 0; h -= 4, | 80 for (; h > 0; h -= 4, |
96 src += src_stride * 4, | 81 src += src_stride * 4, |
97 dst += dst_stride * 4) { // loop_horiz_v | 82 dst += dst_stride * 4) { // loop_horiz_v |
98 s = src; | 83 s = src; |
99 d24u8 = vld1_u8(s); | 84 d24u8 = vld1_u8(s); |
100 s += src_stride; | 85 s += src_stride; |
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
248 uint8_t *d; | 233 uint8_t *d; |
249 uint32x2_t d2u32, d3u32; | 234 uint32x2_t d2u32, d3u32; |
250 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; | 235 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; |
251 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; | 236 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; |
252 int16x4_t d24s16, d25s16, d26s16, d27s16; | 237 int16x4_t d24s16, d25s16, d26s16, d27s16; |
253 uint16x4_t d2u16, d3u16, d4u16, d5u16; | 238 uint16x4_t d2u16, d3u16, d4u16, d5u16; |
254 int16x8_t q0s16; | 239 int16x8_t q0s16; |
255 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 240 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
256 int32x4_t q1s32, q2s32, q14s32, q15s32; | 241 int32x4_t q1s32, q2s32, q14s32, q15s32; |
257 | 242 |
258 if (y_step_q4 != 16) { | 243 assert(y_step_q4 == 16); |
259 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | |
260 filter_x, x_step_q4, | |
261 filter_y, y_step_q4, w, h); | |
262 return; | |
263 } | |
264 | 244 |
265 src -= src_stride * 3; | 245 src -= src_stride * 3; |
266 q0s16 = vld1q_s16(filter_y); | 246 q0s16 = vld1q_s16(filter_y); |
267 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h | 247 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h |
268 s = src; | 248 s = src; |
269 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); | 249 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); |
270 s += src_stride; | 250 s += src_stride; |
271 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); | 251 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); |
272 s += src_stride; | 252 s += src_stride; |
273 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); | 253 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
351 | 331 |
352 q8u16 = q10u16; | 332 q8u16 = q10u16; |
353 d18s16 = d22s16; | 333 d18s16 = d22s16; |
354 d19s16 = d24s16; | 334 d19s16 = d24s16; |
355 q10u16 = q13u16; | 335 q10u16 = q13u16; |
356 d22s16 = d25s16; | 336 d22s16 = d25s16; |
357 } | 337 } |
358 } | 338 } |
359 return; | 339 return; |
360 } | 340 } |
OLD | NEW |