OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include <assert.h> |
12 | 13 |
13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
14 #include "./vpx_dsp_rtcd.h" | 15 #include "./vpx_dsp_rtcd.h" |
15 #include "vpx/vpx_integer.h" | 16 #include "vpx/vpx_integer.h" |
16 #include "vpx_ports/mem.h" | 17 #include "vpx_ports/mem.h" |
17 | 18 |
18 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |
19 uint8_t *dst, ptrdiff_t dst_stride, | |
20 const int16_t *filter_x, int x_step_q4, | |
21 const int16_t *filter_y, int y_step_q4, | |
22 int w, int h); | |
23 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |
24 uint8_t *dst, ptrdiff_t dst_stride, | |
25 const int16_t *filter_x, int x_step_q4, | |
26 const int16_t *filter_y, int y_step_q4, | |
27 int w, int h); | |
28 | |
29 static INLINE int32x4_t MULTIPLY_BY_Q0( | 19 static INLINE int32x4_t MULTIPLY_BY_Q0( |
30 int16x4_t dsrc0, | 20 int16x4_t dsrc0, |
31 int16x4_t dsrc1, | 21 int16x4_t dsrc1, |
32 int16x4_t dsrc2, | 22 int16x4_t dsrc2, |
33 int16x4_t dsrc3, | 23 int16x4_t dsrc3, |
34 int16x4_t dsrc4, | 24 int16x4_t dsrc4, |
35 int16x4_t dsrc5, | 25 int16x4_t dsrc5, |
36 int16x4_t dsrc6, | 26 int16x4_t dsrc6, |
37 int16x4_t dsrc7, | 27 int16x4_t dsrc7, |
38 int16x8_t q0s16) { | 28 int16x8_t q0s16) { |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
75 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; | 65 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; |
76 int16x8_t q0s16; | 66 int16x8_t q0s16; |
77 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 67 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
78 int32x4_t q1s32, q2s32, q14s32, q15s32; | 68 int32x4_t q1s32, q2s32, q14s32, q15s32; |
79 uint16x8x2_t q0x2u16; | 69 uint16x8x2_t q0x2u16; |
80 uint8x8x2_t d0x2u8, d1x2u8; | 70 uint8x8x2_t d0x2u8, d1x2u8; |
81 uint32x2x2_t d0x2u32; | 71 uint32x2x2_t d0x2u32; |
82 uint16x4x2_t d0x2u16, d1x2u16; | 72 uint16x4x2_t d0x2u16, d1x2u16; |
83 uint32x4x2_t q0x2u32; | 73 uint32x4x2_t q0x2u32; |
84 | 74 |
85 if (x_step_q4 != 16) { | 75 assert(x_step_q4 == 16); |
86 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | |
87 filter_x, x_step_q4, | |
88 filter_y, y_step_q4, w, h); | |
89 return; | |
90 } | |
91 | 76 |
92 q0s16 = vld1q_s16(filter_x); | 77 q0s16 = vld1q_s16(filter_x); |
93 | 78 |
94 src -= 3; // adjust for taps | 79 src -= 3; // adjust for taps |
95 for (; h > 0; h -= 4) { // loop_horiz_v | 80 for (; h > 0; h -= 4) { // loop_horiz_v |
96 s = src; | 81 s = src; |
97 d24u8 = vld1_u8(s); | 82 d24u8 = vld1_u8(s); |
98 s += src_stride; | 83 s += src_stride; |
99 d25u8 = vld1_u8(s); | 84 d25u8 = vld1_u8(s); |
100 s += src_stride; | 85 s += src_stride; |
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
264 uint32x2_t d2u32, d3u32, d6u32, d7u32; | 249 uint32x2_t d2u32, d3u32, d6u32, d7u32; |
265 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; | 250 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; |
266 uint8x16_t q1u8, q3u8; | 251 uint8x16_t q1u8, q3u8; |
267 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; | 252 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; |
268 int16x4_t d24s16, d25s16, d26s16, d27s16; | 253 int16x4_t d24s16, d25s16, d26s16, d27s16; |
269 uint16x4_t d2u16, d3u16, d4u16, d5u16; | 254 uint16x4_t d2u16, d3u16, d4u16, d5u16; |
270 int16x8_t q0s16; | 255 int16x8_t q0s16; |
271 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 256 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
272 int32x4_t q1s32, q2s32, q14s32, q15s32; | 257 int32x4_t q1s32, q2s32, q14s32, q15s32; |
273 | 258 |
274 if (y_step_q4 != 16) { | 259 assert(y_step_q4 == 16); |
275 vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | |
276 filter_x, x_step_q4, | |
277 filter_y, y_step_q4, w, h); | |
278 return; | |
279 } | |
280 | 260 |
281 src -= src_stride * 3; | 261 src -= src_stride * 3; |
282 q0s16 = vld1q_s16(filter_y); | 262 q0s16 = vld1q_s16(filter_y); |
283 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h | 263 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h |
284 s = src; | 264 s = src; |
285 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); | 265 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); |
286 s += src_stride; | 266 s += src_stride; |
287 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); | 267 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); |
288 s += src_stride; | 268 s += src_stride; |
289 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); | 269 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
384 | 364 |
385 q8u16 = q10u16; | 365 q8u16 = q10u16; |
386 d18s16 = d22s16; | 366 d18s16 = d22s16; |
387 d19s16 = d24s16; | 367 d19s16 = d24s16; |
388 q10u16 = q13u16; | 368 q10u16 = q13u16; |
389 d22s16 = d25s16; | 369 d22s16 = d25s16; |
390 } | 370 } |
391 } | 371 } |
392 return; | 372 return; |
393 } | 373 } |
OLD | NEW |