OLD | NEW |
---|---|
(Empty) | |
1 /* Copyright 2017 The Chromium Authors. All rights reserved. | |
2 * Use of this source code is governed by a BSD-style license that can be | |
3 * found in the LICENSE file. | |
4 */ | |
5 | |
6 #include "neon_adler32.h" | |
7 #ifdef __ARM_NEON__ | |
8 #include <arm_neon.h> | |
9 | |
10 static void NEON_accum32(uint32_t *s, const unsigned char *buf, | |
11 unsigned int len) | |
12 { | |
13 static const uint8_t taps[32] = { | |
14 32, 31, 30, 29, 28, 27, 26, 25, | |
15 24, 23, 22, 21, 20, 19, 18, 17, | |
16 16, 15, 14, 13, 12, 11, 10, 9, | |
17 8, 7, 6, 5, 4, 3, 2, 1 }; | |
18 | |
19 uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); | |
20 | |
21 uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); | |
22 adacc = vsetq_lane_u32(s[0], adacc, 0); | |
23 s2acc = vsetq_lane_u32(s[1], s2acc, 0); | |
24 | |
25 while (len >= 2) { | |
26 uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); | |
27 uint16x8_t adler, sum2; | |
28 s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); | |
29 adler = vpaddlq_u8( d0); | |
30 adler = vpadalq_u8(adler, d1); | |
31 sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); | |
32 sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); | |
33 sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); | |
34 sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); | |
35 adacc = vpadalq_u16(adacc, adler); | |
36 s2acc = vpadalq_u16(s2acc, sum2); | |
37 len -= 2; | |
38 buf += 32; | |
39 } | |
40 | |
41 | |
42 while (len > 0) { | |
43 uint8x16_t d0 = vld1q_u8(buf); | |
44 uint16x8_t adler, sum2; | |
45 s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); | |
46 adler = vpaddlq_u8(d0); | |
47 sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); | |
48 sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); | |
49 adacc = vpadalq_u16(adacc, adler); | |
50 s2acc = vpadalq_u16(s2acc, sum2); | |
51 buf += 16; | |
52 len--; | |
53 } | |
54 | |
55 { | |
56 uint32x2_t adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); | |
57 uint32x2_t s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); | |
58 uint32x2_t as = vpadd_u32(adacc2, s2acc2); | |
59 s[0] = vget_lane_u32(as, 0); | |
60 s[1] = vget_lane_u32(as, 1); | |
61 } | |
62 } | |
63 | |
64 static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, | |
65 unsigned int len) | |
66 { | |
67 /* Oldie K&R code integration. */ | |
68 unsigned int i; | |
69 for (i = 0; i < len; ++i) { | |
70 pair[0] += buf[i]; | |
71 pair[1] += pair[0]; | |
72 } | |
73 } | |
74 | |
75 unsigned long NEON_adler32(unsigned long adler, const unsigned char *buf, | |
76 const unsigned int len) | |
77 { | |
78 /* The largest prime smaller than 65536. */ | |
79 const uint32_t M_BASE = 65521; | |
80 /* This is the threshold where doing accumulation may overflow. */ | |
81 const int M_NMAX = 5552; | |
82 | |
83 unsigned long sum2; | |
84 uint32_t pair[2]; | |
85 int n = M_NMAX; | |
86 unsigned int done = 0; | |
87 /* Oldie K&R code integration. */ | |
88 unsigned int i; | |
89 | |
90 /* Split Adler-32 into component sums, it can be supplied by | |
91 * the caller sites (e.g. in a PNG file). | |
92 */ | |
93 sum2 = (adler >> 16) & 0xffff; | |
94 adler &= 0xffff; | |
95 pair[0] = adler; | |
96 pair[1] = sum2; | |
97 | |
98 for (i = 0; i < len; i += n) { | |
99 if (i + n > len) | |
cavalcantii1
2017/02/04 18:53:01
Add parenthesis i.e. if ((i + n) > len)
| |
100 n = len - i; | |
101 | |
102 if (n < 16) | |
103 break; | |
104 | |
105 NEON_accum32(pair, buf + i, n / 16); | |
106 pair[0] %= M_BASE; | |
107 pair[1] %= M_BASE; | |
108 | |
109 done += (n / 16) * 16; | |
110 } | |
111 | |
112 /* Handle the tail elements. */ | |
113 if (done < len) { | |
114 NEON_handle_tail(pair, (buf + done), len - done); | |
115 pair[0] %= M_BASE; | |
116 pair[1] %= M_BASE; | |
117 } | |
118 | |
119 /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ | |
120 return (pair[1] << 16) | pair[0]; | |
121 } | |
122 #endif | |
OLD | NEW |