| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include <assert.h> |
| 12 | 13 |
| 13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 14 #include "./vpx_dsp_rtcd.h" | 15 #include "./vpx_dsp_rtcd.h" |
| 15 #include "vpx/vpx_integer.h" | 16 #include "vpx/vpx_integer.h" |
| 16 #include "vpx_ports/mem.h" | 17 #include "vpx_ports/mem.h" |
| 17 | 18 |
| 18 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |
| 19 uint8_t *dst, ptrdiff_t dst_stride, | |
| 20 const int16_t *filter_x, int x_step_q4, | |
| 21 const int16_t *filter_y, int y_step_q4, | |
| 22 int w, int h); | |
| 23 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |
| 24 uint8_t *dst, ptrdiff_t dst_stride, | |
| 25 const int16_t *filter_x, int x_step_q4, | |
| 26 const int16_t *filter_y, int y_step_q4, | |
| 27 int w, int h); | |
| 28 | |
| 29 static INLINE int32x4_t MULTIPLY_BY_Q0( | 19 static INLINE int32x4_t MULTIPLY_BY_Q0( |
| 30 int16x4_t dsrc0, | 20 int16x4_t dsrc0, |
| 31 int16x4_t dsrc1, | 21 int16x4_t dsrc1, |
| 32 int16x4_t dsrc2, | 22 int16x4_t dsrc2, |
| 33 int16x4_t dsrc3, | 23 int16x4_t dsrc3, |
| 34 int16x4_t dsrc4, | 24 int16x4_t dsrc4, |
| 35 int16x4_t dsrc5, | 25 int16x4_t dsrc5, |
| 36 int16x4_t dsrc6, | 26 int16x4_t dsrc6, |
| 37 int16x4_t dsrc7, | 27 int16x4_t dsrc7, |
| 38 int16x8_t q0s16) { | 28 int16x8_t q0s16) { |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 75 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; | 65 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; |
| 76 int16x8_t q0s16; | 66 int16x8_t q0s16; |
| 77 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 67 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
| 78 int32x4_t q1s32, q2s32, q14s32, q15s32; | 68 int32x4_t q1s32, q2s32, q14s32, q15s32; |
| 79 uint16x8x2_t q0x2u16; | 69 uint16x8x2_t q0x2u16; |
| 80 uint8x8x2_t d0x2u8, d1x2u8; | 70 uint8x8x2_t d0x2u8, d1x2u8; |
| 81 uint32x2x2_t d0x2u32; | 71 uint32x2x2_t d0x2u32; |
| 82 uint16x4x2_t d0x2u16, d1x2u16; | 72 uint16x4x2_t d0x2u16, d1x2u16; |
| 83 uint32x4x2_t q0x2u32; | 73 uint32x4x2_t q0x2u32; |
| 84 | 74 |
| 85 if (x_step_q4 != 16) { | 75 assert(x_step_q4 == 16); |
| 86 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | |
| 87 filter_x, x_step_q4, | |
| 88 filter_y, y_step_q4, w, h); | |
| 89 return; | |
| 90 } | |
| 91 | 76 |
| 92 q0s16 = vld1q_s16(filter_x); | 77 q0s16 = vld1q_s16(filter_x); |
| 93 | 78 |
| 94 src -= 3; // adjust for taps | 79 src -= 3; // adjust for taps |
| 95 for (; h > 0; h -= 4) { // loop_horiz_v | 80 for (; h > 0; h -= 4) { // loop_horiz_v |
| 96 s = src; | 81 s = src; |
| 97 d24u8 = vld1_u8(s); | 82 d24u8 = vld1_u8(s); |
| 98 s += src_stride; | 83 s += src_stride; |
| 99 d25u8 = vld1_u8(s); | 84 d25u8 = vld1_u8(s); |
| 100 s += src_stride; | 85 s += src_stride; |
| (...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 264 uint32x2_t d2u32, d3u32, d6u32, d7u32; | 249 uint32x2_t d2u32, d3u32, d6u32, d7u32; |
| 265 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; | 250 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; |
| 266 uint8x16_t q1u8, q3u8; | 251 uint8x16_t q1u8, q3u8; |
| 267 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; | 252 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; |
| 268 int16x4_t d24s16, d25s16, d26s16, d27s16; | 253 int16x4_t d24s16, d25s16, d26s16, d27s16; |
| 269 uint16x4_t d2u16, d3u16, d4u16, d5u16; | 254 uint16x4_t d2u16, d3u16, d4u16, d5u16; |
| 270 int16x8_t q0s16; | 255 int16x8_t q0s16; |
| 271 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 256 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
| 272 int32x4_t q1s32, q2s32, q14s32, q15s32; | 257 int32x4_t q1s32, q2s32, q14s32, q15s32; |
| 273 | 258 |
| 274 if (y_step_q4 != 16) { | 259 assert(y_step_q4 == 16); |
| 275 vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | |
| 276 filter_x, x_step_q4, | |
| 277 filter_y, y_step_q4, w, h); | |
| 278 return; | |
| 279 } | |
| 280 | 260 |
| 281 src -= src_stride * 3; | 261 src -= src_stride * 3; |
| 282 q0s16 = vld1q_s16(filter_y); | 262 q0s16 = vld1q_s16(filter_y); |
| 283 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h | 263 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h |
| 284 s = src; | 264 s = src; |
| 285 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); | 265 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); |
| 286 s += src_stride; | 266 s += src_stride; |
| 287 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); | 267 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); |
| 288 s += src_stride; | 268 s += src_stride; |
| 289 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); | 269 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 384 | 364 |
| 385 q8u16 = q10u16; | 365 q8u16 = q10u16; |
| 386 d18s16 = d22s16; | 366 d18s16 = d22s16; |
| 387 d19s16 = d24s16; | 367 d19s16 = d24s16; |
| 388 q10u16 = q13u16; | 368 q10u16 = q13u16; |
| 389 d22s16 = d25s16; | 369 d22s16 = d25s16; |
| 390 } | 370 } |
| 391 } | 371 } |
| 392 return; | 372 return; |
| 393 } | 373 } |
| OLD | NEW |