| OLD | NEW | 
|---|
| (Empty) |  | 
|  | 1 commit e7e8afea42721f13a8c7f9bd7cc836a8612d5d10 | 
|  | 2 Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com> | 
|  | 3 Date:   Mon Jan 30 15:30:38 2017 -0800 | 
|  | 4 | 
|  | 5     NEON implementation for Adler32 | 
|  | 6 | 
|  | 7     The checksum is calculated in the uncompressed PNG data | 
|  | 8     and can be made much faster by using SIMD. | 
|  | 9 | 
|  | 10     Tests in ARMv8 yielded an improvement of about 3x | 
|  | 11     (e.g. walltime was 350ms x 125ms for a 4096x4096 bytes | 
|  | 12     executed 30 times). | 
|  | 13 | 
|  | 14 diff --git a/third_party/zlib/BUILD.gn b/third_party/zlib/BUILD.gn | 
|  | 15 index 5086563..09dfa5f 100644 | 
|  | 16 --- a/third_party/zlib/BUILD.gn | 
|  | 17 +++ b/third_party/zlib/BUILD.gn | 
|  | 18 @@ -73,6 +73,11 @@ static_library("zlib") { | 
|  | 19 | 
|  | 20    if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) { | 
|  | 21      sources += [ "x86.c" ] | 
|  | 22 +  } else if (current_cpu == "arm" || current_cpu == "arm64") { | 
|  | 23 +    sources += [ | 
|  | 24 +      "neon_adler32.c", | 
|  | 25 +      "neon_adler32.h", | 
|  | 26 +    ] | 
|  | 27    } | 
|  | 28 | 
|  | 29    configs -= [ "//build/config/compiler:chromium_code" ] | 
|  | 30 diff --git a/third_party/zlib/adler32.c b/third_party/zlib/adler32.c | 
|  | 31 index d0be438..26dad16 100644 | 
|  | 32 --- a/third_party/zlib/adler32.c | 
|  | 33 +++ b/third_party/zlib/adler32.c | 
|  | 34 @@ -6,6 +6,9 @@ | 
|  | 35  /* @(#) $Id$ */ | 
|  | 36 | 
|  | 37  #include "zutil.h" | 
|  | 38 +#ifdef __ARM_NEON__ | 
|  | 39 +#include "neon_adler32.h" | 
|  | 40 +#endif | 
|  | 41 | 
|  | 42  local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); | 
|  | 43 | 
|  | 44 @@ -65,6 +68,12 @@ uLong ZEXPORT adler32_z(adler, buf, len) | 
|  | 45      const Bytef *buf; | 
|  | 46      z_size_t len; | 
|  | 47  { | 
|  | 48 +#ifdef __ARM_NEON__ | 
|  | 49 +    if (len > 31) { | 
|  | 50 +      return NEON_adler32(adler, buf, len); | 
|  | 51 +    } | 
|  | 52 +#endif | 
|  | 53 + | 
|  | 54      unsigned long sum2; | 
|  | 55      unsigned n; | 
|  | 56 | 
|  | 57 diff --git a/third_party/zlib/neon_adler32.c b/third_party/zlib/neon_adler32.c | 
|  | 58 new file mode 100644 | 
|  | 59 index 0000000..8356842 | 
|  | 60 --- /dev/null | 
|  | 61 +++ b/third_party/zlib/neon_adler32.c | 
|  | 62 @@ -0,0 +1,122 @@ | 
|  | 63 +/* Copyright 2017 The Chromium Authors. All rights reserved. | 
|  | 64 + * Use of this source code is governed by a BSD-style license that can be | 
|  | 65 + * found in the LICENSE file. | 
|  | 66 +*/ | 
|  | 67 + | 
|  | 68 +#include "neon_adler32.h" | 
|  | 69 +#ifdef __ARM_NEON__ | 
|  | 70 +#include <arm_neon.h> | 
|  | 71 + | 
|  | 72 +static void NEON_accum32(uint32_t *s, const unsigned char *buf, | 
|  | 73 +                         unsigned int len) | 
|  | 74 +{ | 
|  | 75 +  static const uint8_t taps[32] = { | 
|  | 76 +    32, 31, 30, 29, 28, 27, 26, 25, | 
|  | 77 +    24, 23, 22, 21, 20, 19, 18, 17, | 
|  | 78 +    16, 15, 14, 13, 12, 11, 10, 9, | 
|  | 79 +    8, 7, 6, 5, 4, 3, 2, 1 }; | 
|  | 80 + | 
|  | 81 +  uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); | 
|  | 82 + | 
|  | 83 +  uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); | 
|  | 84 +  adacc = vsetq_lane_u32(s[0], adacc, 0); | 
|  | 85 +  s2acc = vsetq_lane_u32(s[1], s2acc, 0); | 
|  | 86 + | 
|  | 87 +  while (len >= 2) { | 
|  | 88 +    uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); | 
|  | 89 +    uint16x8_t adler, sum2; | 
|  | 90 +    s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); | 
|  | 91 +    adler = vpaddlq_u8(       d0); | 
|  | 92 +    adler = vpadalq_u8(adler, d1); | 
|  | 93 +    sum2 = vmull_u8(      vget_low_u8(t0), vget_low_u8(d0)); | 
|  | 94 +    sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); | 
|  | 95 +    sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); | 
|  | 96 +    sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); | 
|  | 97 +    adacc = vpadalq_u16(adacc, adler); | 
|  | 98 +    s2acc = vpadalq_u16(s2acc, sum2); | 
|  | 99 +    len -= 2; | 
|  | 100 +    buf += 32; | 
|  | 101 +  } | 
|  | 102 + | 
|  | 103 + | 
|  | 104 +  while (len > 0) { | 
|  | 105 +    uint8x16_t d0 = vld1q_u8(buf); | 
|  | 106 +    uint16x8_t adler, sum2; | 
|  | 107 +    s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); | 
|  | 108 +    adler = vpaddlq_u8(d0); | 
|  | 109 +    sum2 = vmull_u8(      vget_low_u8(t1), vget_low_u8(d0)); | 
|  | 110 +    sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); | 
|  | 111 +    adacc = vpadalq_u16(adacc, adler); | 
|  | 112 +    s2acc = vpadalq_u16(s2acc, sum2); | 
|  | 113 +    buf += 16; | 
|  | 114 +    len--; | 
|  | 115 +  } | 
|  | 116 + | 
|  | 117 +  { | 
|  | 118 +    uint32x2_t adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); | 
|  | 119 +    uint32x2_t s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); | 
|  | 120 +    uint32x2_t as = vpadd_u32(adacc2, s2acc2); | 
|  | 121 +    s[0] = vget_lane_u32(as, 0); | 
|  | 122 +    s[1] = vget_lane_u32(as, 1); | 
|  | 123 +  } | 
|  | 124 +} | 
|  | 125 + | 
|  | 126 +static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, | 
|  | 127 +                             unsigned int len) | 
|  | 128 +{ | 
|  | 129 +  /* Oldie K&R code integration. */ | 
|  | 130 +  unsigned int i; | 
|  | 131 +  for (i = 0; i < len; ++i) { | 
|  | 132 +    pair[0] += buf[i]; | 
|  | 133 +    pair[1] += pair[0]; | 
|  | 134 +  } | 
|  | 135 +} | 
|  | 136 + | 
|  | 137 +unsigned long NEON_adler32(unsigned long adler, const unsigned char *buf, | 
|  | 138 +                           const unsigned int len) | 
|  | 139 +{ | 
|  | 140 +  /* The largest prime smaller than 65536. */ | 
|  | 141 +  const uint32_t M_BASE = 65521; | 
|  | 142 +  /* This is the threshold where doing accumulation may overflow. */ | 
|  | 143 +  const int M_NMAX = 5552; | 
|  | 144 + | 
|  | 145 +  unsigned long sum2; | 
|  | 146 +  uint32_t pair[2]; | 
|  | 147 +  int n = M_NMAX; | 
|  | 148 +  unsigned int done = 0; | 
|  | 149 +  /* Oldie K&R code integration. */ | 
|  | 150 +  unsigned int i; | 
|  | 151 + | 
|  | 152 +  /* Split Adler-32 into component sums, it can be supplied by | 
|  | 153 +   * the caller sites (e.g. in a PNG file). | 
|  | 154 +   */ | 
|  | 155 +  sum2 = (adler >> 16) & 0xffff; | 
|  | 156 +  adler &= 0xffff; | 
|  | 157 +  pair[0] = adler; | 
|  | 158 +  pair[1] = sum2; | 
|  | 159 + | 
|  | 160 +  for (i = 0; i < len; i += n) { | 
|  | 161 +    if ((i + n) > len) | 
|  | 162 +      n = len - i; | 
|  | 163 + | 
|  | 164 +    if (n < 16) | 
|  | 165 +      break; | 
|  | 166 + | 
|  | 167 +    NEON_accum32(pair, buf + i, n / 16); | 
|  | 168 +    pair[0] %= M_BASE; | 
|  | 169 +    pair[1] %= M_BASE; | 
|  | 170 + | 
|  | 171 +    done += (n / 16) * 16; | 
|  | 172 +  } | 
|  | 173 + | 
|  | 174 +  /* Handle the tail elements. */ | 
|  | 175 +  if (done < len) { | 
|  | 176 +    NEON_handle_tail(pair, (buf + done), len - done); | 
|  | 177 +    pair[0] %= M_BASE; | 
|  | 178 +    pair[1] %= M_BASE; | 
|  | 179 +  } | 
|  | 180 + | 
|  | 181 +  /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ | 
|  | 182 +  return (pair[1] << 16) | pair[0]; | 
|  | 183 +} | 
|  | 184 +#endif | 
|  | 185 diff --git a/third_party/zlib/neon_adler32.h b/third_party/zlib/neon_adler32.h | 
|  | 186 new file mode 100644 | 
|  | 187 index 0000000..3043e3d | 
|  | 188 --- /dev/null | 
|  | 189 +++ b/third_party/zlib/neon_adler32.h | 
|  | 190 @@ -0,0 +1,12 @@ | 
|  | 191 +/* Copyright 2017 The Chromium Authors. All rights reserved. | 
|  | 192 + * Use of this source code is governed by a BSD-style license that can be | 
|  | 193 + * found in the LICENSE file. | 
|  | 194 +*/ | 
|  | 195 +#ifndef __NEON_ADLER32__ | 
|  | 196 +#define __NEON_ADLER32__ | 
|  | 197 + | 
|  | 198 +#ifdef __ARM_NEON__ | 
|  | 199 +unsigned long NEON_adler32(unsigned long adler, const unsigned char *buf, | 
|  | 200 +                           const unsigned int len); | 
|  | 201 +#endif | 
|  | 202 +#endif | 
| OLD | NEW | 
|---|