OLD | NEW |
(Empty) | |
| 1 /* |
| 2 bsdiff_search.h -- Suffix array search. |
| 3 |
| 4 Copyright 2003 Colin Percival |
| 5 |
| 6 For the terms under which this work may be distributed, please see |
| 7 the adjoining file "LICENSE". |
| 8 |
| 9 ChangeLog: |
| 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
| 11 values throughout. |
| 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
| 13 2015-08-03 - Change search() to template to allow PagedArray usage. |
| 14 --Samuel Huang <huangs@chromium.org> |
| 15 2015-08-19 - Optimized search(). |
| 16 --Samuel Huang <huangs@chromium.org> |
| 17 2016-06-02 - Move matchlen() and search() to a separate file; comment; format. |
| 18 --Samuel Huang <huangs@chromium.org> |
| 19 */ |
| 20 |
| 21 #include <algorithm> |
| 22 #include <cstring> |
| 23 |
| 24 namespace bsdiff { |
| 25 |
| 26 // Similar to ::memcmp(), but returns match length instead. |
| 27 inline int matchlen(const unsigned char *srcbuf, const unsigned char *keybuf, |
| 28 int size) { |
| 29 for (int i = 0; i < size; ++i) |
| 30 if (srcbuf[i] != keybuf[i]) |
| 31 return i; |
| 32 return size; |
| 33 } |
| 34 |
| 35 // Finds a suffix in |srcbuf| that has the longest common prefix with |keybuf|, |
| 36 // aided by suffix array |I| of |srcbuf|. Returns the length of the match, and |
| 37 // writes the position within |srcbuf| to |pos|. |
| 38 template <class T> |
| 39 int search(T I, const unsigned char *srcbuf, int srcize, |
| 40 const unsigned char *keybuf, int keysize, int *pos) { |
| 41 // We use binary search here, but the desired position is not the usual |
| 42 // position from std::lower_bound()-like binary search. |
| 43 // For example, given sorted suffixes of |srcbuf|: |
| 44 // SAAA... |
| 45 // SNNN... |
| 46 // TAAA... |
| 47 // Applying std::lower_bound() binary search results in: |
| 48 // |keybuf| = "SNBB" => "SNNN...". |
| 49 // |keybuf| = "SNZZ" => "TAAA...". |
| 50 // But the desired result for both cases is "SNNN...", with "SN" as the |
| 51 // longest common prefix. To solve this: |
| 52 // Case 1: |keysize| == 0: We'd return 0, although |pos| is arbitrary. |
| 53 // Case 2: |srcsize| == 0: We'd return 0, and |pos| = 0. |
| 54 // Case 3: |srcsize| > 0 and |keysize| > 0: We'd assert rank-|lo| suffix |
| 55 // of |srcbuf| is always strictly lexicographically less than |keybuf|. This |
| 56 // holds for |lo| = 0, which corresponds to "". The search terminates when |
| 57 // hi == lo + 1. At this point, suffixes at rank |lo| and rank |hi| both |
| 58 // may be the desired match, so we check both. In case of tie, there's NO |
| 59 // guarantee that we take the solution with minimum rank. |
| 60 // Finding the minimum rank solution is not needed, and it would create more |
| 61 // complexity. For example, with suffixes "MA...", "MB...", "MC...", "ME...", |
| 62 // searching look for "MD" would yield |lo| -> "MC...", and the minimum rank |
| 63 // solution "MA...", would be rather far away. |
| 64 // |
| 65 // TODO(huangs): Optimize this: match length at |t| is >= match length at |
| 66 // |lo| and at |hi|. If we track this then we can skip comparisons! |
| 67 int lo = 0; |
| 68 int hi = srcize; |
| 69 while (hi - lo > 1) { |
| 70 int t = (lo + hi) / 2; |
| 71 if (::memcmp(srcbuf + I[t], keybuf, std::min(srcize - I[t], keysize)) < 0) |
| 72 lo = t; |
| 73 else |
| 74 hi = t; |
| 75 } |
| 76 |
| 77 int x = matchlen(srcbuf + I[lo], keybuf, std::min(srcize - I[lo], keysize)); |
| 78 int y = matchlen(srcbuf + I[hi], keybuf, std::min(srcize - I[hi], keysize)); |
| 79 if (x > y) { |
| 80 *pos = I[lo]; |
| 81 return x; |
| 82 } |
| 83 *pos = I[hi]; |
| 84 return y; |
| 85 } |
| 86 |
| 87 } // namespace bsdiff |
OLD | NEW |