/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stddef.h>
#include <stdint.h>
#include <arm_neon.h>

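/* Average a w x h block of src into dst with rounding:
 * dst[i] = (src[i] + dst[i] + 1) >> 1 for every pixel. The filter
 * arguments exist only to match the shared convolve prototype and are
 * ignored here. Block widths are dispatched to fixed-width NEON paths
 * below (64, 32, 16, 8, and 4 pixels).
 */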
void vp9_convolve_avg_neon(
        const uint8_t *src,    // r0
        ptrdiff_t src_stride,  // r1
        uint8_t *dst,          // r2
        ptrdiff_t dst_stride,  // r3
        const int16_t *filter_x,
        int filter_x_stride,
        const int16_t *filter_y,
        int filter_y_stride,
        int w,
        int h) {
    uint8_t *d;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    // Zero-initialize the lane-load targets so vld1_lane_u32() never
    // reads an indeterminate vector (avoids undefined behavior and
    // compiler warnings on the 4-wide path).
    uint32x2_t d0u32 = vdup_n_u32(0);
    uint32x2_t d2u32 = vdup_n_u32(0);
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
    (void)filter_x;  (void)filter_x_stride;
    (void)filter_y;  (void)filter_y_stride;

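    // Read the destination back through a second pointer, since dst
    // itself advances with the stores.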
    d = dst;
    if (w > 32) {  // avg64
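        // One iteration averages a full 64-pixel row as four 16-byte
        // vectors.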
        for (; h > 0; h -= 1) {
            q0u8  = vld1q_u8(src);
            q1u8  = vld1q_u8(src + 16);
            q2u8  = vld1q_u8(src + 32);
            q3u8  = vld1q_u8(src + 48);
            src += src_stride;
            q8u8  = vld1q_u8(d);
            q9u8  = vld1q_u8(d + 16);
            q10u8 = vld1q_u8(d + 32);
            q11u8 = vld1q_u8(d + 48);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q8u8);
            q1u8 = vrhaddq_u8(q1u8, q9u8);
            q2u8 = vrhaddq_u8(q2u8, q10u8);
            q3u8 = vrhaddq_u8(q3u8, q11u8);

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            vst1q_u8(dst + 32, q2u8);
            vst1q_u8(dst + 48, q3u8);
            dst += dst_stride;
        }
    } else if (w == 32) {  // avg32
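        // Two 32-pixel rows per iteration, two 16-byte vectors each.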
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            q1u8 = vld1q_u8(src + 16);
            src += src_stride;
            q2u8 = vld1q_u8(src);
            q3u8 = vld1q_u8(src + 16);
            src += src_stride;
            q8u8 = vld1q_u8(d);
            q9u8 = vld1q_u8(d + 16);
            d += dst_stride;
            q10u8 = vld1q_u8(d);
            q11u8 = vld1q_u8(d + 16);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q8u8);
            q1u8 = vrhaddq_u8(q1u8, q9u8);
            q2u8 = vrhaddq_u8(q2u8, q10u8);
            q3u8 = vrhaddq_u8(q3u8, q11u8);

            vst1q_u8(dst, q0u8);
            vst1q_u8(dst + 16, q1u8);
            dst += dst_stride;
            vst1q_u8(dst, q2u8);
            vst1q_u8(dst + 16, q3u8);
            dst += dst_stride;
        }
    } else if (w > 8) {  // avg16
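        // Two 16-pixel rows per iteration, one 16-byte vector each.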
        for (; h > 0; h -= 2) {
            q0u8 = vld1q_u8(src);
            src += src_stride;
            q1u8 = vld1q_u8(src);
            src += src_stride;
            q2u8 = vld1q_u8(d);
            d += dst_stride;
            q3u8 = vld1q_u8(d);
            d += dst_stride;

            q0u8 = vrhaddq_u8(q0u8, q2u8);
            q1u8 = vrhaddq_u8(q1u8, q3u8);

            vst1q_u8(dst, q0u8);
            dst += dst_stride;
            vst1q_u8(dst, q1u8);
            dst += dst_stride;
        }
    } else if (w == 8) {  // avg8
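        // Pack two 8-pixel rows into a single 16-byte vector so one
        // vrhaddq_u8() averages both rows at once.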
        for (; h > 0; h -= 2) {
            d0u8 = vld1_u8(src);
            src += src_stride;
            d1u8 = vld1_u8(src);
            src += src_stride;
            d2u8 = vld1_u8(d);
            d += dst_stride;
            d3u8 = vld1_u8(d);
            d += dst_stride;

            q0u8 = vcombine_u8(d0u8, d1u8);
            q1u8 = vcombine_u8(d2u8, d3u8);
            q0u8 = vrhaddq_u8(q0u8, q1u8);

            vst1_u8(dst, vget_low_u8(q0u8));
            dst += dst_stride;
            vst1_u8(dst, vget_high_u8(q0u8));
            dst += dst_stride;
        }
    } else {  // avg4
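        // Load each 4-pixel row into one 32-bit lane, so two rows fill
        // a d register. The uint32_t casts assume the 4-pixel rows are
        // 4-byte aligned.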
        for (; h > 0; h -= 2) {
            d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
            src += src_stride;
            d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
            src += src_stride;
            d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
            d += dst_stride;
            d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
            d += dst_stride;

            d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                             vreinterpret_u8_u32(d2u32));

            d0u32 = vreinterpret_u32_u8(d0u8);
            vst1_lane_u32((uint32_t *)dst, d0u32, 0);
            dst += dst_stride;
            vst1_lane_u32((uint32_t *)dst, d0u32, 1);
            dst += dst_stride;
        }
    }
}