| Index: source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
 | 
| ===================================================================
 | 
| --- source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c	(revision 0)
 | 
| +++ source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c	(revision 0)
 | 
| @@ -0,0 +1,364 @@
 | 
| +/*
 | 
| + *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 | 
| + *
 | 
| + *  Use of this source code is governed by a BSD-style license
 | 
| + *  that can be found in the LICENSE file in the root of the source
 | 
| + *  tree. An additional intellectual property rights grant can be found
 | 
| + *  in the file PATENTS.  All contributing project authors may
 | 
| + *  be found in the AUTHORS file in the root of the source tree.
 | 
| + */
 | 
| +
 | 
| +#include <stdlib.h>
 | 
| +
 | 
| +#include "./vp9_rtcd.h"
 | 
| +#include "vp9/common/vp9_common.h"
 | 
| +#include "vp9/common/vp9_loopfilter.h"
 | 
| +#include "vp9/common/vp9_onyxc_int.h"
 | 
| +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 | 
| +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
 | 
| +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
 | 
| +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
 | 
| +
 | 
| +#if HAVE_DSPR2
 | 
| +void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
 | 
| +                                           int pitch,
 | 
| +                                           const uint8_t *blimit,
 | 
| +                                           const uint8_t *limit,
 | 
| +                                           const uint8_t *thresh,
 | 
| +                                           int count) {  /* count is unused; 2 x 4 bytes along the edge are always processed */
 | 
| +  uint8_t   i;
 | 
| +  uint32_t  mask;
 | 
| +  uint32_t  hev;
 | 
| +  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
 | 
| +  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
 | 
| +  uint32_t  thresh_vec, flimit_vec, limit_vec;
 | 
| +  uint32_t  uflimit, ulimit, uthresh;
 | 
| +
 | 
| +  uflimit = *blimit;
 | 
| +  ulimit = *limit;
 | 
| +  uthresh = *thresh;
 | 
| +
 | 
| +  /* create quad-byte: replicate each 8-bit value into all 4 bytes of a word */
 | 
| +  __asm__ __volatile__ (
 | 
| +      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
 | 
| +      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
 | 
| +      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
 | 
| +
 | 
| +      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
 | 
| +        [limit_vec] "=r" (limit_vec)
 | 
| +      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
 | 
| +  );
 | 
| +
 | 
| +  /* prefetch data for store */
 | 
| +  vp9_prefetch_store(s);
 | 
| +
 | 
| +  /* loop filter designed to work using chars so that we can make maximum use
 | 
| +     of 8 bit simd instructions; each of the 2 passes handles one 4-byte word per row. */
 | 
| +  for (i = 0; i < 2; i++) {
 | 
| +    sm1 = s - (pitch << 2);  /* s - 4 * pitch */
 | 
| +    s0 = sm1 + pitch;  /* s - 3 * pitch */
 | 
| +    s1 = s0 + pitch;  /* s - 2 * pitch */
 | 
| +    s2 = s - pitch;
 | 
| +    s3 = s;
 | 
| +    s4 = s + pitch;
 | 
| +    s5 = s4 + pitch;  /* s + 2 * pitch */
 | 
| +    s6 = s5 + pitch;  /* s + 3 * pitch */
 | 
| +
 | 
| +    __asm__ __volatile__ (  /* load the 4 inner rows s1..s4 first */
 | 
| +        "lw     %[p1],  (%[s1])    \n\t"
 | 
| +        "lw     %[p2],  (%[s2])    \n\t"
 | 
| +        "lw     %[p3],  (%[s3])    \n\t"
 | 
| +        "lw     %[p4],  (%[s4])    \n\t"
 | 
| +
 | 
| +        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
 | 
| +        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
 | 
| +    );
 | 
| +
 | 
| +    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
 | 
| +       mask will be zero and filtering is not needed */
 | 
| +    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
 | 
| +      __asm__ __volatile__ (  /* outer rows are only loaded when the early-out fails */
 | 
| +          "lw       %[pm1], (%[sm1])   \n\t"
 | 
| +          "lw       %[p0],  (%[s0])    \n\t"
 | 
| +          "lw       %[p5],  (%[s5])    \n\t"
 | 
| +          "lw       %[p6],  (%[s6])    \n\t"
 | 
| +
 | 
| +          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
 | 
| +            [p6] "=&r" (p6)
 | 
| +          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
 | 
| +      );
 | 
| +
 | 
| +      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
 | 
| +                                pm1, p0, p3, p4, p5, p6,
 | 
| +                                thresh_vec, &hev, &mask);
 | 
| +
 | 
| +      /* if mask == 0, filtering is not needed */
 | 
| +      if (mask) {
 | 
| +        /* filtering */
 | 
| +        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 | 
| +
 | 
| +        __asm__ __volatile__ (  /* write p1..p4 back to rows s1..s4 */
 | 
| +            "sw     %[p1],  (%[s1])    \n\t"
 | 
| +            "sw     %[p2],  (%[s2])    \n\t"
 | 
| +            "sw     %[p3],  (%[s3])    \n\t"
 | 
| +            "sw     %[p4],  (%[s4])    \n\t"
 | 
| +
 | 
| +            :  /* NOTE(review): asm stores to memory but declares no "memory" clobber - confirm this is safe */
 | 
| +            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
 | 
| +              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
 | 
| +        );
 | 
| +      }
 | 
| +    }
 | 
| +
 | 
| +    s = s + 4;  /* next 4 bytes along the edge */
 | 
| +  }
 | 
| +}
 | 
| +
 | 
| +void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
 | 
| +                                         int pitch,
 | 
| +                                         const uint8_t *blimit,
 | 
| +                                         const uint8_t *limit,
 | 
| +                                         const uint8_t *thresh,
 | 
| +                                         int count) {  /* count is unused; 2 x 4 rows are always processed */
 | 
| +  uint8_t   i;
 | 
| +  uint32_t  mask, hev;
 | 
| +  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
 | 
| +  uint8_t   *s1, *s2, *s3, *s4;
 | 
| +  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
 | 
| +  uint32_t  thresh_vec, flimit_vec, limit_vec;
 | 
| +  uint32_t  uflimit, ulimit, uthresh;
 | 
| +
 | 
| +  uflimit = *blimit;
 | 
| +  ulimit = *limit;
 | 
| +  uthresh = *thresh;
 | 
| +
 | 
| +  /* create quad-byte: replicate each 8-bit value into all 4 bytes of a word */
 | 
| +  __asm__ __volatile__ (
 | 
| +      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
 | 
| +      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
 | 
| +      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
 | 
| +
 | 
| +      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
 | 
| +        [limit_vec] "=r" (limit_vec)
 | 
| +      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
 | 
| +  );
 | 
| +
 | 
| +  /* prefetch data for store */
 | 
| +  vp9_prefetch_store(s + pitch);
 | 
| +
 | 
| +  for (i = 0; i < 2; i++) {
 | 
| +    s1 = s;
 | 
| +    s2 = s + pitch;
 | 
| +    s3 = s2 + pitch;
 | 
| +    s4 = s3 + pitch;
 | 
| +    s  = s4 + pitch;  /* next pass starts at s + 4 * pitch */
 | 
| +
 | 
| +    /* load quad-byte vectors: 4 bytes before and 4 bytes from each row pointer
 | 
| +     * memory is 4 byte aligned
 | 
| +     */
 | 
| +    p2  = *((uint32_t *)(s1 - 4));
 | 
| +    p6  = *((uint32_t *)(s1));
 | 
| +    p1  = *((uint32_t *)(s2 - 4));
 | 
| +    p5  = *((uint32_t *)(s2));
 | 
| +    p0  = *((uint32_t *)(s3 - 4));
 | 
| +    p4  = *((uint32_t *)(s3));
 | 
| +    pm1 = *((uint32_t *)(s4 - 4));
 | 
| +    p3  = *((uint32_t *)(s4));
 | 
| +
 | 
| +    /* transpose pm1, p0, p1, p2 */
 | 
| +    __asm__ __volatile__ (
 | 
| +        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
 | 
| +        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
 | 
| +        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
 | 
| +        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
 | 
| +
 | 
| +        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
 | 
| +        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
 | 
| +        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
 | 
| +        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
 | 
| +
 | 
| +        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
 | 
| +        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
 | 
| +        "append         %[p1],      %[sec3],    16          \n\t"
 | 
| +        "append         %[pm1],     %[sec4],    16          \n\t"
 | 
| +
 | 
| +        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
 | 
| +          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
 | 
| +          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
 | 
| +          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
 | 
| +        :
 | 
| +    );
 | 
| +
 | 
| +    /* transpose p3, p4, p5, p6 */
 | 
| +    __asm__ __volatile__ (
 | 
| +        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
 | 
| +        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
 | 
| +        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
 | 
| +        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
 | 
| +
 | 
| +        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
 | 
| +        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
 | 
| +        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
 | 
| +        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
 | 
| +
 | 
| +        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
 | 
| +        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
 | 
| +        "append         %[p5],      %[sec3],    16          \n\t"
 | 
| +        "append         %[p3],      %[sec4],    16          \n\t"
 | 
| +
 | 
| +        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
 | 
| +          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
 | 
| +          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
 | 
| +          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
 | 
| +        :
 | 
| +    );
 | 
| +
 | 
| +    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
 | 
| +     * mask will be zero and filtering is not needed
 | 
| +     */
 | 
| +    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
 | 
| +      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
 | 
| +                                p0, p3, p4, p5, p6, thresh_vec,
 | 
| +                                &hev, &mask);
 | 
| +
 | 
| +      /* if mask == 0, filtering is not needed */
 | 
| +      if (mask) {
 | 
| +        /* filtering */
 | 
| +        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 | 
| +
 | 
| +        /* unpack processed 4x4 neighborhood: the low byte of p1..p4 is stored
 | 
| +         * to row s4, then each srl by 8 exposes the byte for rows s3, s2, s1;
 | 
| +         * don't use transpose on output data because memory isn't aligned
 | 
| +         */
 | 
| +        __asm__ __volatile__ (
 | 
| +            "sb     %[p4],   1(%[s4])    \n\t"
 | 
| +            "sb     %[p3],   0(%[s4])    \n\t"
 | 
| +            "sb     %[p2],  -1(%[s4])    \n\t"
 | 
| +            "sb     %[p1],  -2(%[s4])    \n\t"
 | 
| +
 | 
| +            :  /* NOTE(review): asm stores to memory but declares no "memory" clobber - confirm this is safe */
 | 
| +            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
 | 
| +              [s4] "r" (s4)
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "srl    %[p4],  %[p4],  8     \n\t"
 | 
| +            "srl    %[p3],  %[p3],  8     \n\t"
 | 
| +            "srl    %[p2],  %[p2],  8     \n\t"
 | 
| +            "srl    %[p1],  %[p1],  8     \n\t"
 | 
| +
 | 
| +            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
 | 
| +            :
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "sb     %[p4],   1(%[s3])    \n\t"
 | 
| +            "sb     %[p3],   0(%[s3])    \n\t"
 | 
| +            "sb     %[p2],  -1(%[s3])    \n\t"
 | 
| +            "sb     %[p1],  -2(%[s3])    \n\t"
 | 
| +
 | 
| +            : [p1] "+r" (p1)  /* NOTE(review): p1 is only read by these stores; the sibling store blocks list it as an input */
 | 
| +            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "srl    %[p4],  %[p4],  8     \n\t"
 | 
| +            "srl    %[p3],  %[p3],  8     \n\t"
 | 
| +            "srl    %[p2],  %[p2],  8     \n\t"
 | 
| +            "srl    %[p1],  %[p1],  8     \n\t"
 | 
| +
 | 
| +            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
 | 
| +            :
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "sb     %[p4],   1(%[s2])    \n\t"
 | 
| +            "sb     %[p3],   0(%[s2])    \n\t"
 | 
| +            "sb     %[p2],  -1(%[s2])    \n\t"
 | 
| +            "sb     %[p1],  -2(%[s2])    \n\t"
 | 
| +
 | 
| +            :
 | 
| +            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
 | 
| +              [s2] "r" (s2)
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "srl    %[p4],  %[p4],  8     \n\t"
 | 
| +            "srl    %[p3],  %[p3],  8     \n\t"
 | 
| +            "srl    %[p2],  %[p2],  8     \n\t"
 | 
| +            "srl    %[p1],  %[p1],  8     \n\t"
 | 
| +
 | 
| +            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
 | 
| +            :
 | 
| +        );
 | 
| +
 | 
| +        __asm__ __volatile__ (
 | 
| +            "sb     %[p4],   1(%[s1])    \n\t"
 | 
| +            "sb     %[p3],   0(%[s1])    \n\t"
 | 
| +            "sb     %[p2],  -1(%[s1])    \n\t"
 | 
| +            "sb     %[p1],  -2(%[s1])    \n\t"
 | 
| +
 | 
| +            :
 | 
| +            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
 | 
| +              [s1] "r" (s1)
 | 
| +        );
 | 
| +      }
 | 
| +    }
 | 
| +  }
 | 
| +}
 | 
| +
 | 
| +void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
 | 
| +                                              const uint8_t *blimit0,
 | 
| +                                              const uint8_t *limit0,
 | 
| +                                              const uint8_t *thresh0,
 | 
| +                                              const uint8_t *blimit1,
 | 
| +                                              const uint8_t *limit1,
 | 
| +                                              const uint8_t *thresh1) {  /* two calls: set 0 at s, set 1 at s + 8 */
 | 
| +  vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
 | 
| +  vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 | 
| +}
 | 
| +
 | 
| +void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
 | 
| +                                                const uint8_t *blimit0,
 | 
| +                                                const uint8_t *limit0,
 | 
| +                                                const uint8_t *thresh0,
 | 
| +                                                const uint8_t *blimit1,
 | 
| +                                                const uint8_t *limit1,
 | 
| +                                                const uint8_t *thresh1) {  /* two calls: set 0 at s, set 1 at s + 8 */
 | 
| +  vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
 | 
| +  vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,
 | 
| +                                          1);
 | 
| +}
 | 
| +
 | 
| +void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
 | 
| +                                            const uint8_t *blimit0,
 | 
| +                                            const uint8_t *limit0,
 | 
| +                                            const uint8_t *thresh0,
 | 
| +                                            const uint8_t *blimit1,
 | 
| +                                            const uint8_t *limit1,
 | 
| +                                            const uint8_t *thresh1) {  /* two calls: set 0 at s, set 1 at s + 8 * p */
 | 
| +  vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
 | 
| +  vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
 | 
| +                                      1);
 | 
| +}
 | 
| +
 | 
| +void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
 | 
| +                                              const uint8_t *blimit0,
 | 
| +                                              const uint8_t *limit0,
 | 
| +                                              const uint8_t *thresh0,
 | 
| +                                              const uint8_t *blimit1,
 | 
| +                                              const uint8_t *limit1,
 | 
| +                                              const uint8_t *thresh1) {  /* two calls: set 0 at s, set 1 at s + 8 * p */
 | 
| +  vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
 | 
| +  vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
 | 
| +                                       1);
 | 
| +}
 | 
| +
 | 
| +void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p,
 | 
| +                                         const uint8_t *blimit,
 | 
| +                                         const uint8_t *limit,
 | 
| +                                         const uint8_t *thresh) {  /* two calls with the same limits: at s and at s + 8 * p */
 | 
| +  vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);
 | 
| +  vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);
 | 
| +}
 | 
| +#endif  // #if HAVE_DSPR2
 | 
| 
 |