courgette/third_party/qsufsort.h - Issue 1914303002: [Courgette] Fix sorting in QSufSort's split().

Side by Side Diff: courgette/third_party/qsufsort.h

Issue 1914303002: [Courgette] Fix sorting in QSufSort's split(). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 qsufsort.h -- Suffix array generation.	2 qsufsort.h -- Suffix array generation.

3	3

4 Copyright 2003 Colin Percival	4 Copyright 2003 Colin Percival

5	5

6 For the terms under which this work may be distributed, please see	6 For the terms under which this work may be distributed, please see

7 the adjoining file "LICENSE".	7 the adjoining file "LICENSE".

8	8

9 ChangeLog:	9 ChangeLog:

10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit	10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit

11 values throughout.	11 values throughout.

12 --Benjamin Smedberg <benjamin@smedbergs.us>	12 --Benjamin Smedberg <benjamin@smedbergs.us>

13 2010-05-26 - Use a paged array for V and I. The address space may be too	13 2010-05-26 - Use a paged array for V and I. The address space may be too

14 fragmented for these big arrays to be contiguous.	14 fragmented for these big arrays to be contiguous.

15 --Stephen Adams <sra@chromium.org>	15 --Stephen Adams <sra@chromium.org>

16 2015-08-03 - Extrat qsufsort to a separate file as template.	16 2015-08-03 - Extract qsufsort to a separate file as template.

17 --Samuel Huang <huangs@chromium.org>	17 --Samuel Huang <huangs@chromium.org>

18 2015-08-19 - Optimize split() and search(), add comments.	18 2015-08-19 - Optimize split() and search(), add comments.

19 --Samuel Huang <huangs@chromium.org>	19 --Samuel Huang <huangs@chromium.org>

	20 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution.

	21 --Samuel Huang <huangs@chromium.org>

20 */	22 */

21	23

22 #include <algorithm>	24 #include <algorithm>

23 #include <cstring>	25 #include <cstring>

24	26

25 namespace courgette {	27 namespace courgette {

26 namespace qsuf {	28 namespace qsuf {

27	29

28 // ------------------------------------------------------------------------	30 // ------------------------------------------------------------------------

29 //	31 //

30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the	32 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the

31 // code formatting and variable names. The changes from the original are:	33 // code formatting and variable names. The changes from the original are:

32 // (1) replacing tabs with spaces,	34 // (1) replacing tabs with spaces,

33 // (2) indentation,	35 // (2) indentation,

34 // (3) using 'const',	36 // (3) using 'const',

35 // (4) changing the V and I parameters from int* to template <typename T>.	37 // (4) changing the V and I parameters from int* to template <typename T>.

36 // (5) optimizing split() and search(); fix styles.	38 // (5) optimizing search(); fix styles.

	39 // (6) rewriting split() to use std::sort(), followed by I, V resolution.

37 //	40 //

38 // The code appears to be a rewritten version of the suffix array algorithm	41 // The code appears to be a rewritten version of the suffix array algorithm

39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko	42 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko

40 // Sadakane, special cased for bytes.	43 // Sadakane, special cased for bytes.

41	44

	45 // Before: V[I[start:end]] = end - 1.

	46 // After: V[I[start:end]] take values in [start, end), forming "groups" of

	47 // suffixes that are unresolved up to length \|h\|. But if a group has size 1,

	48 // i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on

	49 // the next iteration we'd skip it.

42 template <typename T> static void split(T I, T V, int start, int end, int h) {	50 template <typename T> static void split(T I, T V, int start, int end, int h) {

43 // For small interval, apply selection sort.	51 if (start >= end)

44 if (end - start < 6) {	52 return;

45 for (int i = start; i < end; ) {	53

46 int skip = 1;	54 // Sort I[start:end] using V[I[start:end] + h] as key.

47 int best = V[I[i] + h];	55 std::sort(I.begin() + start, I.begin() + end,

48 for (int j = i + 1; j < end; j++) {	56 [&](int a, int b) { return V[a + h] < V[b + h]; });

49 int cur = V[I[j] + h];	57

50 if (best > cur) {	58 // V[I[start:end] + h] is sorted. [start,end) can be split into distinct

51 best = cur;	59 // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b),

52 int tmp = I[i];	60 // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign

53 I[i] = I[j];	61 // I[a] := -1 to indicate completion.

54 I[j] = tmp;	62 int group_last_member = end - 1;

55 skip = 1;	63 int group_key = V[I[group_last_member] + h];

56 } else if (best == cur) {	64

57 int tmp = I[i + skip];	65 // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass.

58 I[i + skip] = I[j];	66 for (int i = end - 2; i >= start; --i) {

59 I[j] = tmp;	67 int cur_key = V[I[i] + h];

60 ++skip;	68

61 }	69 // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since

62 }	70 // we're reassigning all V[] values of \|end - 1\| to value in [begin, end)

63 if (skip == 1) {	71 // (that never appear elsewhere), so if we find anything in [begin, end),

64 V[I[i]] = i;	72 // we know it used to be \|end - 1\|, and can reinterpret accordingly.

65 I[i] = -1;	73 if (cur_key >= start && cur_key < end)

66 } else {	74 cur_key = end - 1;

67 for (int j = i, jend = i + skip; j < jend; j++)	75

68 V[I[j]] = jend - 1;	76 // Did we find new group?

69 }	77 if (group_key != cur_key) {

70 i += skip;	78 // If previous group found has 1 member, then mark completion.

	79 if (group_last_member == i + 1)

	80 I[i + 1] = -1;

	81 group_last_member = i;

	82 group_key = cur_key;

71 }	83 }

72 return;	84 V[I[i]] = group_last_member;

73 }	85 }

74	86

75 // Split [start, end) into 3 intervals:	87 if (group_last_member == start)

76 // [start, j) with secondary keys < pivot,	88 I[start] = -1;

77 // [j, k) with secondary keys == pivot,

78 // [k, end) with secondary keys > pivot.

79 int pivot = V[I[(start + end) / 2] + h];

80 int j = start;

81 int k = end;

82 for (int i = start; i < k; ) {

83 int cur = V[I[i] + h];

84 if (cur < pivot) {

85 if (i != j) {

86 int tmp = I[i];

87 I[i] = I[j];

88 I[j] = tmp;

89 }

90 ++i;

91 ++j;

92 } else if (cur > pivot) {

93 --k;

94 int tmp = I[i];

95 I[i] = I[k];

96 I[k] = tmp;

97 } else {

98 ++i;

99 }

100 }

101

102 // Recurse on the "< pivot" piece.

103 if (start < j)

104 split<T>(I, V, start, j, h);

105

106 // Update the "== pivot" piece.

107 if (j == k - 1) {

108 V[I[j]] = j;

109 I[j] = -1;

110 } else {

111 for (int i = j; i < k; ++i)

112 V[I[i]] = k - 1;

113 }

114

115 // Recurse on the "> pivot" piece.

116 if (k < end)

117 split<T>(I, V, k, end, h);

118 }	89 }

119	90

120 template <class T>	91 template <class T>

121 static void	92 static void

122 qsufsort(T I, T V,const unsigned char *old,int oldsize)	93 qsufsort(T I, T V,const unsigned char *old,int oldsize)

123 {	94 {

124 int buckets[256];	95 int buckets[256];

125 int i,h,len;	96 int i,h,len;

126	97

127 for(i=0;i<256;i++) buckets[i]=0;	98 for(i=0;i<256;i++) buckets[i]=0;

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
190 }	161 }

191 *pos = I[hi];	162 *pos = I[hi];

192 return y;	163 return y;

193 }	164 }

194	165

195 // End of 'verbatim' code.	166 // End of 'verbatim' code.

196 // ------------------------------------------------------------------------	167 // ------------------------------------------------------------------------

197	168

198 } // namespace qsuf	169 } // namespace qsuf

199 } // namespace courgette	170 } // namespace courgette

OLD	NEW

« no previous file with comments | « courgette/third_party/paged_array_unittest.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »