Index: simd/jdsample-altivec.c
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
new file mode 100644
index 0000000000000000000000000000000000000000..63d6d8ca1f47205ea35c7422ee6451f7c254bb2c
--- /dev/null
+++ b/simd/jdsample-altivec.c
@@ -0,0 +1,392 @@
 | 
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
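+/* Fancy (triangle filter) 2:1 horizontal upsampling.  Each output pair is
+ * computed from the current sample and its horizontal neighbors:
+ *   left output  = (3 * this + previous + 1) >> 2
+ *   right output = (3 * this + next + 2) >> 2
+ * At the row edges the missing neighbor is replaced with the edge sample.
+ */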
 | 
+void
+jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+    out;
+  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
+    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
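+    /* If the width is not a multiple of the vector size, replicate the
+     * last column one sample past the edge so that the "next" neighbor of
+     * the final column reads a defined value.
+     */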
 | 
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+    this0 = vec_ld(0, inptr);
+    p_last0 = vec_perm(this0, this0, last_index_col0);
+    last0 = this0;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr += 16, outptr += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_last0 = vec_perm(last0, this0, last_index);
+        last0 = this0;
+      }
+
+      if (incol <= 16)
+        p_next0 = vec_perm(this0, this0, next_index_lastcol);
+      else {
+        next0 = vec_ld(16, inptr);
+        p_next0 = vec_perm(this0, next0, next_index);
+      }
+
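+      /* this0l/this0h = 3 * this0 for the low/high eight columns.
+       * vec_mule()/vec_mulo() return the even/odd byte products as shorts;
+       * merging them restores column order.  Left outputs then get
+       * (3 * this + previous + 1) >> 2 and right outputs get
+       * (3 * this + next + 2) >> 2, and merge_pack_index interleaves the
+       * two results and packs them back down to bytes.
+       */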
 | 
+      this0e = (__vector short)vec_mule(this0, pb_three);
+      this0o = (__vector short)vec_mulo(this0, pb_three);
+      this0l = vec_mergeh(this0e, this0o);
+      this0h = vec_mergel(this0e, this0o);
+
+      last0l = (__vector short)VEC_UNPACKHU(p_last0);
+      last0h = (__vector short)VEC_UNPACKLU(p_last0);
+      last0l = vec_add(last0l, pw_one);
+
+      next0l = (__vector short)VEC_UNPACKHU(p_next0);
+      next0h = (__vector short)VEC_UNPACKLU(p_next0);
+      next0l = vec_add(next0l, pw_two);
+
+      outle = vec_add(this0l, last0l);
+      outlo = vec_add(this0l, next0l);
+      outle = vec_sr(outle, (__vector unsigned short)pw_two);
+      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr);
+
+      if (incol > 8) {
+        last0h = vec_add(last0h, pw_one);
+        next0h = vec_add(next0h, pw_two);
+
+        outhe = vec_add(this0h, last0h);
+        outho = vec_add(this0h, next0h);
+        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+        outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr);
+      }
+
+      this0 = next0;
+    }
+  }
+}
 | 
+
+
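+/* Fancy 2:2 upsampling.  Each input sample expands to a 2x2 block of output
+ * samples, weighted 9/16 toward the nearest input sample and 3/16, 3/16,
+ * 1/16 toward its horizontal, vertical, and diagonal neighbors.  This is
+ * done in two passes: a vertical pass computes column sums
+ * (3 * this row + the nearer adjacent row), and a horizontal pass combines
+ * each column sum with its neighbor: (3 * colsum + adjacent colsum + bias)
+ * >> 4, using bias 8 for the left output column and 7 for the right.
+ */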
 | 
+void
+jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char this_1, this0, this1, out;
+  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = {0}, nextcolsum_1h = {0},
+    nextcolsum1l = {0}, nextcolsum1h = {0},
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __vector unsigned char pb_zero = { __16X(0) },
+    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
+    last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
+    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
+    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+  __vector unsigned short pw_four = { __8X(4) };
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+
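+    /* Prime the loop: column sums for the first vector of samples.
+     * thiscolsum_1 = 3 * this row + the row above (feeds the upper output
+     * row); thiscolsum1 = 3 * this row + the row below (feeds the lower
+     * output row).  The p_lastcolsum variants are the same sums shifted
+     * right by one column, with the first column replicated at the left
+     * edge.
+     */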
 | 
+    this0 = vec_ld(0, inptr0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
+    this0l = vec_mladd(this0l, pw_three, pw_zero);
+    this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+    this_1 = vec_ld(0, inptr_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
+    thiscolsum_1l = vec_add(this0l, this_1l);
+    thiscolsum_1h = vec_add(this0h, this_1h);
+    lastcolsum_1h = thiscolsum_1h;
+    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+
+    this1 = vec_ld(0, inptr1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
+    thiscolsum1l = vec_add(this0l, this1l);
+    thiscolsum1h = vec_add(this0h, this1h);
+    lastcolsum1h = thiscolsum1h;
+    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+         outptr0 += 32, outptr1 += 32) {
+
+      if (downsampled_width - incol > 0) {
+        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
+      }
+
+      if (incol <= 16) {
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+                                   next_index_lastcol);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+                                  next_index_lastcol);
+      } else {
+        this0 = vec_ld(16, inptr0);
+        this0l = (__vector short)VEC_UNPACKHU(this0);
+        this0h = (__vector short)VEC_UNPACKLU(this0);
+        this0l = vec_mladd(this0l, pw_three, pw_zero);
+        this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+        this_1 = vec_ld(16, inptr_1);
+        this_1l = (__vector short)VEC_UNPACKHU(this_1);
+        this_1h = (__vector short)VEC_UNPACKLU(this_1);
+        nextcolsum_1l = vec_add(this0l, this_1l);
+        nextcolsum_1h = vec_add(this0h, this_1h);
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+        this1 = vec_ld(16, inptr1);
+        this1l = (__vector short)VEC_UNPACKHU(this1);
+        this1h = (__vector short)VEC_UNPACKLU(this1);
+        nextcolsum1l = vec_add(this0l, this1l);
+        nextcolsum1h = vec_add(this0h, this1h);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+      }
+
+      /* Process the upper row */
+
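+      /* Left output column: (3 * this colsum + previous colsum + 8) >> 4;
+       * right output column: (3 * this colsum + next colsum + 7) >> 4.
+       * merge_pack_index interleaves the two and packs them back to bytes.
+       */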
 | 
+      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum_1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum_1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr0);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum_1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum_1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr0);
+      }
+
+      /* Process the lower row */
+
+      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr1);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr1);
+      }
+
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
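+/* Plain 2:1 horizontal upsampling: each input sample is simply duplicated.
+ * The loop rounds the column count up to a whole number of output vectors,
+ * so it may read and write a few samples past the nominal width.
+ */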
 | 
+void
+jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr);
+      vec_st(inh, 16, outptr);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr);
+        vec_st(inh, 48, outptr);
+      }
+    }
+  }
+}
+
+
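+/* Plain 2:2 upsampling: each input sample is duplicated horizontally, and
+ * each output row is written twice.
+ */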
 | 
+void
+jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
+
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr0);
+      vec_st(inl, 0, outptr1);
+
+      vec_st(inh, 16, outptr0);
+      vec_st(inh, 16, outptr1);
+
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr0);
+        vec_st(inl, 32, outptr1);
+
+        vec_st(inh, 48, outptr0);
+        vec_st(inh, 48, outptr1);
+      }
+    }
+  }
+}