OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. |
| 2 // |
| 3 // Permission is hereby granted, free of charge, to any person |
| 4 // obtaining a copy of this software and associated documentation |
| 5 // files (the "Software"), to deal in the Software without |
| 6 // restriction, including without limitation the rights to use, |
| 7 // copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 8 // copies of the Software, and to permit persons to whom the |
| 9 // Software is furnished to do so, subject to the following |
| 10 // conditions: |
| 11 // |
| 12 // The above copyright notice and this permission notice shall be |
| 13 // included in all copies or substantial portions of the Software. |
| 14 // |
| 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 17 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| 19 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 20 // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 21 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 22 // OTHER DEALINGS IN THE SOFTWARE. |
| 23 // |
| 24 // ChangeLog: |
| 25 // 2016-07-22 - Initial commit and adaption to use PagedArray. |
| 26 // --Samuel Huang <huangs@chromium.org> |
| 27 |
| 28 #include "courgette/third_party/divsufsort/divsufsort_private.h" |
| 29 |
| 30 #include <stdlib.h> |
| 31 |
/* Number of entries in each bucket table: one per symbol for bucket_A, and
   one per ordered pair of symbols for bucket_B. */
#define BUCKET_A_SIZE (ALPHABET_SIZE)
#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)

/* Bucket accessors. Each expands to an lvalue and refers to an array named
   bucket_A or bucket_B in the scope where the macro is used. BUCKET_B and
   BUCKET_BSTAR share the bucket_B table but swap which symbol selects the
   row, so a pair (ch0, ch1) with ch0 != ch1 maps to two different slots. */
#define BUCKET_A(ch0) bucket_A[(ch0)]
#if ALPHABET_SIZE == 256
/* With 256 symbols the row offset is formed with an 8-bit shift. */
#define BUCKET_B(ch0, ch1) (bucket_B[((ch1) << 8) | (ch0)])
#define BUCKET_BSTAR(ch0, ch1) (bucket_B[((ch0) << 8) | (ch1)])
#else
#define BUCKET_B(ch0, ch1) (bucket_B[(ch1) * ALPHABET_SIZE + (ch0)])
#define BUCKET_BSTAR(ch0, ch1) (bucket_B[(ch0) * ALPHABET_SIZE + (ch1)])
#endif
| 43 |
| 44 namespace divsuf { |
| 45 |
| 46 /*- Private Functions -*/ |
| 47 |
| 48 namespace { |
| 49 |
| 50 /* Sorts suffixes of type B*. */ |
| 51 saidx_t |
| 52 sort_typeBstar(const sauchar_t *T, saidx_it SA, |
| 53 saidx_t *bucket_A, saidx_t *bucket_B, |
| 54 saidx_t n) { |
| 55 saidx_it PAb, ISAb, buf; |
| 56 saidx_t i, j, k, t, m, bufsize; |
| 57 saint_t c0, c1; |
| 58 |
| 59 /* Initialize bucket arrays. */ |
| 60 for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } |
| 61 for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } |
| 62 |
| 63 /* Count the number of occurrences of the first one or two characters of each |
| 64 type A, B and B* suffix. Moreover, store the beginning position of all |
| 65 type B* suffixes into the array SA. */ |
| 66 for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { |
| 67 /* type A suffix. */ |
| 68 do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); |
| 69 if(0 <= i) { |
| 70 /* type B* suffix. */ |
| 71 ++BUCKET_BSTAR(c0, c1); |
| 72 SA[--m] = i; |
| 73 /* type B suffix. */ |
| 74 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { |
| 75 ++BUCKET_B(c0, c1); |
| 76 } |
| 77 } |
| 78 } |
| 79 m = n - m; |
| 80 /* |
| 81 note: |
| 82 A type B* suffix is lexicographically smaller than a type B suffix that |
| 83 begins with the same first two characters. |
| 84 */ |
| 85 |
| 86 /* Calculate the index of start/end point of each bucket. */ |
| 87 for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { |
| 88 t = i + BUCKET_A(c0); |
| 89 BUCKET_A(c0) = i + j; /* start point */ |
| 90 i = t + BUCKET_B(c0, c0); |
| 91 for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { |
| 92 j += BUCKET_BSTAR(c0, c1); |
| 93 BUCKET_BSTAR(c0, c1) = j; /* end point */ |
| 94 i += BUCKET_B(c0, c1); |
| 95 } |
| 96 } |
| 97 |
| 98 if(0 < m) { |
| 99 /* Sort the type B* suffixes by their first two characters. */ |
| 100 PAb = SA + n - m; ISAb = SA + m; |
| 101 for(i = m - 2; 0 <= i; --i) { |
| 102 t = PAb[i], c0 = T[t], c1 = T[t + 1]; |
| 103 SA[--BUCKET_BSTAR(c0, c1)] = i; |
| 104 } |
| 105 t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; |
| 106 SA[--BUCKET_BSTAR(c0, c1)] = m - 1; |
| 107 |
| 108 /* Sort the type B* substrings using sssort. */ |
| 109 buf = SA + m, bufsize = n - (2 * m); |
| 110 for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { |
| 111 for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { |
| 112 i = BUCKET_BSTAR(c0, c1); |
| 113 if(1 < (j - i)) { |
| 114 sssort(T, PAb, SA + i, SA + j, |
| 115 buf, bufsize, 2, n, *(SA + i) == (m - 1)); |
| 116 } |
| 117 } |
| 118 } |
| 119 |
| 120 /* Compute ranks of type B* substrings. */ |
| 121 for(i = m - 1; 0 <= i; --i) { |
| 122 if(0 <= SA[i]) { |
| 123 j = i; |
| 124 do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); |
| 125 SA[i + 1] = i - j; |
| 126 if(i <= 0) { break; } |
| 127 } |
| 128 j = i; |
| 129 do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); |
| 130 ISAb[SA[i]] = j; |
| 131 } |
| 132 |
| 133 /* Construct the inverse suffix array of type B* suffixes using trsort. */ |
| 134 trsort(ISAb, SA, m, 1); |
| 135 |
| 136 /* Set the sorted order of tyoe B* suffixes. */ |
| 137 for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { |
| 138 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } |
| 139 if(0 <= i) { |
| 140 t = i; |
| 141 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } |
| 142 SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; |
| 143 } |
| 144 } |
| 145 |
| 146 /* Calculate the index of start/end point of each bucket. */ |
| 147 BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ |
| 148 for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { |
| 149 i = BUCKET_A(c0 + 1) - 1; |
| 150 for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { |
| 151 t = i - BUCKET_B(c0, c1); |
| 152 BUCKET_B(c0, c1) = i; /* end point */ |
| 153 |
| 154 /* Move all type B* suffixes to the correct position. */ |
| 155 for(i = t, j = BUCKET_BSTAR(c0, c1); |
| 156 j <= k; |
| 157 --i, --k) { SA[i] = SA[k]; } |
| 158 } |
| 159 BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ |
| 160 BUCKET_B(c0, c0) = i; /* end point */ |
| 161 } |
| 162 } |
| 163 |
| 164 return m; |
| 165 } |
| 166 |
| 167 /* Constructs the suffix array by using the sorted order of type B* suffixes. */ |
| 168 void |
| 169 construct_SA(const sauchar_t *T, saidx_it SA, |
| 170 saidx_t *bucket_A, saidx_t *bucket_B, |
| 171 saidx_t n, saidx_t m) { |
| 172 saidx_it i, j, k; |
| 173 saidx_t s; |
| 174 saint_t c0, c1, c2; |
| 175 |
| 176 if(0 < m) { |
| 177 /* Construct the sorted order of type B suffixes by using |
| 178 the sorted order of type B* suffixes. */ |
| 179 for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { |
| 180 /* Scan the suffix array from right to left. */ |
| 181 for(i = SA + BUCKET_BSTAR(c1, c1 + 1), |
| 182 j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; |
| 183 i <= j; |
| 184 --j) { |
| 185 if(0 < (s = *j)) { |
| 186 assert(T[s] == c1); |
| 187 assert(((s + 1) < n) && (T[s] <= T[s + 1])); |
| 188 assert(T[s - 1] <= T[s]); |
| 189 *j = ~s; |
| 190 c0 = T[--s]; |
| 191 if((0 < s) && (T[s - 1] > c0)) { s = ~s; } |
| 192 if(c0 != c2) { |
| 193 if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } |
| 194 k = SA + BUCKET_B(c2 = c0, c1); |
| 195 } |
| 196 assert(k < j); |
| 197 *k-- = s; |
| 198 } else { |
| 199 assert(((s == 0) && (T[s] == c1)) || (s < 0)); |
| 200 *j = ~s; |
| 201 } |
| 202 } |
| 203 } |
| 204 } |
| 205 |
| 206 /* Construct the suffix array by using |
| 207 the sorted order of type B suffixes. */ |
| 208 k = SA + BUCKET_A(c2 = T[n - 1]); |
| 209 *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); |
| 210 /* Scan the suffix array from left to right. */ |
| 211 for(i = SA, j = SA + n; i < j; ++i) { |
| 212 if(0 < (s = *i)) { |
| 213 assert(T[s - 1] >= T[s]); |
| 214 c0 = T[--s]; |
| 215 if((s == 0) || (T[s - 1] < c0)) { s = ~s; } |
| 216 if(c0 != c2) { |
| 217 BUCKET_A(c2) = k - SA; |
| 218 k = SA + BUCKET_A(c2 = c0); |
| 219 } |
| 220 assert(i < k); |
| 221 *k++ = s; |
| 222 } else { |
| 223 assert(s < 0); |
| 224 *i = ~s; |
| 225 } |
| 226 } |
| 227 } |
| 228 |
| 229 } // namespace |
| 230 |
| 231 /*---------------------------------------------------------------------------*/ |
| 232 |
| 233 /*- Function -*/ |
| 234 |
| 235 saint_t |
| 236 divsufsort(const sauchar_t *T, saidx_it SA, saidx_t n) { |
| 237 saidx_t *bucket_A, *bucket_B; |
| 238 saidx_t m; |
| 239 saint_t err = 0; |
| 240 |
| 241 /* Check arguments. */ |
| 242 if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } |
| 243 else if(n == 0) { return 0; } |
| 244 else if(n == 1) { SA[0] = 0; return 0; } |
| 245 else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } |
| 246 |
| 247 bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); |
| 248 bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); |
| 249 |
| 250 /* Suffixsort. */ |
| 251 if((bucket_A != NULL) && (bucket_B != NULL)) { |
| 252 m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); |
| 253 construct_SA(T, SA, bucket_A, bucket_B, n, m); |
| 254 } else { |
| 255 err = -2; |
| 256 } |
| 257 |
| 258 free(bucket_B); |
| 259 free(bucket_A); |
| 260 |
| 261 return err; |
| 262 } |
| 263 |
| 264 saint_t divsufsort_include_empty(const sauchar_t *T, saidx_it SA, saidx_t n) { |
| 265 SA[0] = n; // Manually add the empty string suffix. |
| 266 return divsufsort(T, SA + 1, n); |
| 267 } |
| 268 |
| 269 } // namespace divsuf |
OLD | NEW |