/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stddef.h>
#include <arm_neon.h>

// Unfiltered averaging path: each byte of the w x h destination block becomes
// the rounded average of the corresponding source and destination bytes,
// i.e. dst = (src + dst + 1) >> 1. The filter arguments belong to the common
// convolve prototype and are unused here.
void vp9_convolve_avg_neon(
    const uint8_t *src,    // r0
    ptrdiff_t src_stride,  // r1
    uint8_t *dst,          // r2
    ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  // Zero-initialize so the first vld1_lane_u32() in the 4-wide path does not
  // read uninitialized lanes.
  uint32x2_t d0u32 = vdup_n_u32(0), d2u32 = vdup_n_u32(0);
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x; (void)filter_x_stride;
  (void)filter_y; (void)filter_y_stride;

  d = dst;
  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      // vrhaddq_u8 is a rounding halving add: each byte lane becomes
      // (src + dst + 1) >> 1.
      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else {  // avg4
    // 4-wide blocks are moved as single 32-bit lanes, two rows per pass.
    for (; h > 0; h -= 2) {
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                       vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
  return;
}
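
/*
 * Illustrative scalar sketch of the operation the NEON paths above perform,
 * for reference only. The helper name is hypothetical and this routine is
 * not used anywhere; it simply spells out the rounded average that
 * vrhadd_u8/vrhaddq_u8 compute lane by lane.
 */
static void convolve_avg_scalar_sketch(const uint8_t *src,
                                       ptrdiff_t src_stride, uint8_t *dst,
                                       ptrdiff_t dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((src[x] + dst[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}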