| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include <assert.h> |
| 12 | 13 |
| 13 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 14 #include "./vpx_dsp_rtcd.h" | 15 #include "./vpx_dsp_rtcd.h" |
| 15 #include "vpx/vpx_integer.h" | 16 #include "vpx/vpx_integer.h" |
| 16 #include "vpx_ports/mem.h" | 17 #include "vpx_ports/mem.h" |
| 17 | 18 |
| 18 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, | |
| 19 uint8_t *dst, ptrdiff_t dst_stride, | |
| 20 const int16_t *filter_x, int x_step_q4, | |
| 21 const int16_t *filter_y, int y_step_q4, | |
| 22 int w, int h); | |
| 23 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, | |
| 24 uint8_t *dst, ptrdiff_t dst_stride, | |
| 25 const int16_t *filter_x, int x_step_q4, | |
| 26 const int16_t *filter_y, int y_step_q4, | |
| 27 int w, int h); | |
| 28 | |
| 29 static INLINE int32x4_t MULTIPLY_BY_Q0( | 19 static INLINE int32x4_t MULTIPLY_BY_Q0( |
| 30 int16x4_t dsrc0, | 20 int16x4_t dsrc0, |
| 31 int16x4_t dsrc1, | 21 int16x4_t dsrc1, |
| 32 int16x4_t dsrc2, | 22 int16x4_t dsrc2, |
| 33 int16x4_t dsrc3, | 23 int16x4_t dsrc3, |
| 34 int16x4_t dsrc4, | 24 int16x4_t dsrc4, |
| 35 int16x4_t dsrc5, | 25 int16x4_t dsrc5, |
| 36 int16x4_t dsrc6, | 26 int16x4_t dsrc6, |
| 37 int16x4_t dsrc7, | 27 int16x4_t dsrc7, |
| 38 int16x8_t q0s16) { | 28 int16x8_t q0s16) { |
| (...skipping 36 matching lines...) |
| 75 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; | 65 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; |
| 76 int16x8_t q0s16; | 66 int16x8_t q0s16; |
| 77 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 67 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
| 78 int32x4_t q1s32, q2s32, q14s32, q15s32; | 68 int32x4_t q1s32, q2s32, q14s32, q15s32; |
| 79 uint16x8x2_t q0x2u16; | 69 uint16x8x2_t q0x2u16; |
| 80 uint8x8x2_t d0x2u8, d1x2u8; | 70 uint8x8x2_t d0x2u8, d1x2u8; |
| 81 uint32x2x2_t d0x2u32; | 71 uint32x2x2_t d0x2u32; |
| 82 uint16x4x2_t d0x2u16, d1x2u16; | 72 uint16x4x2_t d0x2u16, d1x2u16; |
| 83 uint32x4x2_t q0x2u32; | 73 uint32x4x2_t q0x2u32; |
| 84 | 74 |
| 85 if (x_step_q4 != 16) { | 75 assert(x_step_q4 == 16); |
| 86 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | |
| 87 filter_x, x_step_q4, | |
| 88 filter_y, y_step_q4, w, h); | |
| 89 return; | |
| 90 } | |
| 91 | 76 |
| 92 q0s16 = vld1q_s16(filter_x); | 77 q0s16 = vld1q_s16(filter_x); |
| 93 | 78 |
| 94 src -= 3; // adjust for taps | 79 src -= 3; // adjust for taps |
| 95 for (; h > 0; h -= 4, | 80 for (; h > 0; h -= 4, |
| 96 src += src_stride * 4, | 81 src += src_stride * 4, |
| 97 dst += dst_stride * 4) { // loop_horiz_v | 82 dst += dst_stride * 4) { // loop_horiz_v |
| 98 s = src; | 83 s = src; |
| 99 d24u8 = vld1_u8(s); | 84 d24u8 = vld1_u8(s); |
| 100 s += src_stride; | 85 s += src_stride; |
| (...skipping 147 matching lines...) |
| 248 uint8_t *d; | 233 uint8_t *d; |
| 249 uint32x2_t d2u32, d3u32; | 234 uint32x2_t d2u32, d3u32; |
| 250 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; | 235 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; |
| 251 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; | 236 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; |
| 252 int16x4_t d24s16, d25s16, d26s16, d27s16; | 237 int16x4_t d24s16, d25s16, d26s16, d27s16; |
| 253 uint16x4_t d2u16, d3u16, d4u16, d5u16; | 238 uint16x4_t d2u16, d3u16, d4u16, d5u16; |
| 254 int16x8_t q0s16; | 239 int16x8_t q0s16; |
| 255 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; | 240 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; |
| 256 int32x4_t q1s32, q2s32, q14s32, q15s32; | 241 int32x4_t q1s32, q2s32, q14s32, q15s32; |
| 257 | 242 |
| 258 if (y_step_q4 != 16) { | 243 assert(y_step_q4 == 16); |
| 259 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | |
| 260 filter_x, x_step_q4, | |
| 261 filter_y, y_step_q4, w, h); | |
| 262 return; | |
| 263 } | |
| 264 | 244 |
| 265 src -= src_stride * 3; | 245 src -= src_stride * 3; |
| 266 q0s16 = vld1q_s16(filter_y); | 246 q0s16 = vld1q_s16(filter_y); |
| 267 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h | 247 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h |
| 268 s = src; | 248 s = src; |
| 269 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); | 249 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); |
| 270 s += src_stride; | 250 s += src_stride; |
| 271 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); | 251 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); |
| 272 s += src_stride; | 252 s += src_stride; |
| 273 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); | 253 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); |
| (...skipping 77 matching lines...) |
| 351 | 331 |
| 352 q8u16 = q10u16; | 332 q8u16 = q10u16; |
| 353 d18s16 = d22s16; | 333 d18s16 = d22s16; |
| 354 d19s16 = d24s16; | 334 d19s16 = d24s16; |
| 355 q10u16 = q13u16; | 335 q10u16 = q13u16; |
| 356 d22s16 = d25s16; | 336 d22s16 = d25s16; |
| 357 } | 337 } |
| 358 } | 338 } |
| 359 return; | 339 return; |
| 360 } | 340 } |
| OLD | NEW |