| OLD | NEW |
| 1 /* | 1 /* |
| 2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
| 3 | 3 |
| 4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
| 5 | 5 |
| 6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
| 7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
| 8 | 8 |
| 9 ChangeLog: | 9 ChangeLog: |
| 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
| 11 values throughout. | 11 values throughout. |
| 12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
| 13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
| 14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
| 15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
| 16 2015-08-03 - Extrat qsufsort to a separate file as template. | 16 2015-08-03 - Extract qsufsort to a separate file as template. |
| 17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
| 18 2015-08-19 - Optimize split() and search(), add comments. | 18 2015-08-19 - Optimize split() and search(), add comments. |
| 19 --Samuel Huang <huangs@chromium.org> | 19 --Samuel Huang <huangs@chromium.org> |
| 20 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution. |
| 21 --Samuel Huang <huangs@chromium.org> |
| 20 */ | 22 */ |
| 21 | 23 |
| 22 #include <algorithm> | 24 #include <algorithm> |
| 23 #include <cstring> | 25 #include <cstring> |
| 24 | 26 |
| 25 namespace courgette { | 27 namespace courgette { |
| 26 namespace qsuf { | 28 namespace qsuf { |
| 27 | 29 |
| 28 // ------------------------------------------------------------------------ | 30 // ------------------------------------------------------------------------ |
| 29 // | 31 // |
| 30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 32 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
| 31 // code formatting and variable names. The changes from the original are: | 33 // code formatting and variable names. The changes from the original are: |
| 32 // (1) replacing tabs with spaces, | 34 // (1) replacing tabs with spaces, |
| 33 // (2) indentation, | 35 // (2) indentation, |
| 34 // (3) using 'const', | 36 // (3) using 'const', |
| 35 // (4) changing the V and I parameters from int* to template <typename T>. | 37 // (4) changing the V and I parameters from int* to template <typename T>. |
| 36 // (5) optimizing split() and search(); fix styles. | 38 // (5) optimizing search(); fix styles. |
| 39 // (6) rewriting split() to use std::sort(), followed by I, V resolution. |
| 37 // | 40 // |
| 38 // The code appears to be a rewritten version of the suffix array algorithm | 41 // The code appears to be a rewritten version of the suffix array algorithm |
| 39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 42 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
| 40 // Sadakane, special cased for bytes. | 43 // Sadakane, special cased for bytes. |
| 41 | 44 |
| 45 // Before: V[I[start:end]] = end - 1. |
| 46 // After: V[I[start:end]] take values in [start, end), forming "groups" of |
| 47 // suffixes that are unresolved up to length |h|. But if a group has size 1, |
| 48 // i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on |
| 49 // the next iteration we'd skip it. |
| 42 template <typename T> static void split(T I, T V, int start, int end, int h) { | 50 template <typename T> static void split(T I, T V, int start, int end, int h) { |
| 43 // For small interval, apply selection sort. | 51 if (start >= end) |
| 44 if (end - start < 6) { | 52 return; |
| 45 for (int i = start; i < end; ) { | 53 |
| 46 int skip = 1; | 54 // Sort I[start:end] using V[I[start:end] + h] as key. |
| 47 int best = V[I[i] + h]; | 55 std::sort(I.begin() + start, I.begin() + end, |
| 48 for (int j = i + 1; j < end; j++) { | 56 [&](int a, int b) { return V[a + h] < V[b + h]; }); |
| 49 int cur = V[I[j] + h]; | 57 |
| 50 if (best > cur) { | 58 // V[I[start:end] + h] is sorted. [start,end) can be split into distinct |
| 51 best = cur; | 59 // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b), |
| 52 int tmp = I[i]; | 60 // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign |
| 53 I[i] = I[j]; | 61 // I[a] := -1 to indicate completion. |
| 54 I[j] = tmp; | 62 int group_last_member = end - 1; |
| 55 skip = 1; | 63 int group_key = V[I[group_last_member] + h]; |
| 56 } else if (best == cur) { | 64 |
| 57 int tmp = I[i + skip]; | 65 // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass. |
| 58 I[i + skip] = I[j]; | 66 for (int i = end - 2; i >= start; --i) { |
| 59 I[j] = tmp; | 67 int cur_key = V[I[i] + h]; |
| 60 ++skip; | 68 |
| 61 } | 69 // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since |
| 62 } | 70 // we're reassigning all V[] values of |end - 1| to value in [begin, end) |
| 63 if (skip == 1) { | 71 // (that never appear elsewhere), so if we find anything in [begin, end), |
| 64 V[I[i]] = i; | 72 // we know it used to be |end - 1|, and can reinterpret accordingly. |
| 65 I[i] = -1; | 73 if (cur_key >= start && cur_key < end) |
| 66 } else { | 74 cur_key = end - 1; |
| 67 for (int j = i, jend = i + skip; j < jend; j++) | 75 |
| 68 V[I[j]] = jend - 1; | 76 // Did we find new group? |
| 69 } | 77 if (group_key != cur_key) { |
| 70 i += skip; | 78 // If previous group found has 1 member, then mark completion. |
| 79 if (group_last_member == i + 1) |
| 80 I[i + 1] = -1; |
| 81 group_last_member = i; |
| 82 group_key = cur_key; |
| 71 } | 83 } |
| 72 return; | 84 V[I[i]] = group_last_member; |
| 73 } | 85 } |
| 74 | 86 |
| 75 // Split [start, end) into 3 intervals: | 87 if (group_last_member == start) |
| 76 // [start, j) with secondary keys < pivot, | 88 I[start] = -1; |
| 77 // [j, k) with secondary keys == pivot, | |
| 78 // [k, end) with secondary keys > pivot. | |
| 79 int pivot = V[I[(start + end) / 2] + h]; | |
| 80 int j = start; | |
| 81 int k = end; | |
| 82 for (int i = start; i < k; ) { | |
| 83 int cur = V[I[i] + h]; | |
| 84 if (cur < pivot) { | |
| 85 if (i != j) { | |
| 86 int tmp = I[i]; | |
| 87 I[i] = I[j]; | |
| 88 I[j] = tmp; | |
| 89 } | |
| 90 ++i; | |
| 91 ++j; | |
| 92 } else if (cur > pivot) { | |
| 93 --k; | |
| 94 int tmp = I[i]; | |
| 95 I[i] = I[k]; | |
| 96 I[k] = tmp; | |
| 97 } else { | |
| 98 ++i; | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 // Recurse on the "< pivot" piece. | |
| 103 if (start < j) | |
| 104 split<T>(I, V, start, j, h); | |
| 105 | |
| 106 // Update the "== pivot" piece. | |
| 107 if (j == k - 1) { | |
| 108 V[I[j]] = j; | |
| 109 I[j] = -1; | |
| 110 } else { | |
| 111 for (int i = j; i < k; ++i) | |
| 112 V[I[i]] = k - 1; | |
| 113 } | |
| 114 | |
| 115 // Recurse on the "> pivot" piece. | |
| 116 if (k < end) | |
| 117 split<T>(I, V, k, end, h); | |
| 118 } | 89 } |
| 119 | 90 |
| 120 template <class T> | 91 template <class T> |
| 121 static void | 92 static void |
| 122 qsufsort(T I, T V,const unsigned char *old,int oldsize) | 93 qsufsort(T I, T V,const unsigned char *old,int oldsize) |
| 123 { | 94 { |
| 124 int buckets[256]; | 95 int buckets[256]; |
| 125 int i,h,len; | 96 int i,h,len; |
| 126 | 97 |
| 127 for(i=0;i<256;i++) buckets[i]=0; | 98 for(i=0;i<256;i++) buckets[i]=0; |
| (...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 190 } | 161 } |
| 191 *pos = I[hi]; | 162 *pos = I[hi]; |
| 192 return y; | 163 return y; |
| 193 } | 164 } |
| 194 | 165 |
| 195 // End of 'verbatim' code. | 166 // End of 'verbatim' code. |
| 196 // ------------------------------------------------------------------------ | 167 // ------------------------------------------------------------------------ |
| 197 | 168 |
| 198 } // namespace qsuf | 169 } // namespace qsuf |
| 199 } // namespace courgette | 170 } // namespace courgette |
| OLD | NEW |