| OLD | NEW |
| (Empty) |
| 1 // Copyright 2007 Google Inc. | |
| 2 // Author: Lincoln Smith | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // | |
| 8 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 // | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 | |
| 16 #ifndef OPEN_VCDIFF_VCENCODER_H_ | |
| 17 #define OPEN_VCDIFF_VCENCODER_H_ | |
| 18 | |
| 19 #include <cstddef> // size_t | |
| 20 #include <vector> | |
| 21 #include "google/output_string.h" | |
| 22 | |
| 23 namespace open_vcdiff { | |
| 24 | |
// Forward declarations.  This header refers to these types only through
// pointers, so their complete definitions are not required here.
class VCDiffStreamingEncoderImpl;
class VCDiffEngine;
| 27 | |
// Format-extension flags accepted by the VCDiffStreamingEncoder constructor
// and by VCDiffEncoder::SetFormatFlags().  Each flag other than
// VCD_STANDARD_FORMAT switches on an open-vcdiff extension that is not part
// of the RFC 3284 draft standard for VCDIFF.
//
// Because the extensions are non-standard, a caller that specifies any flag
// other than VCD_STANDARD_FORMAT must be certain that the receiver will
// decode the delta file with open-vcdiff, or can at least interpret these
// extensions.  When extensions are in use, the encoder writes an 'S' as the
// fourth character of the delta file to signal that fact.
//
enum VCDiffFormatExtensionFlagValues {
  // No extensions at all: the output conforms to the RFC draft standard
  // for VCDIFF.
  VCD_STANDARD_FORMAT = 0,

  // Interleaved format: within each delta window, instructions and sizes
  // are written together with their corresponding addresses and data,
  // instead of being split into three separate sections.  This makes it
  // possible to produce partially decoded results when only part of a delta
  // window has been received (e.g. when HTTP over TCP is the transport.)
  VCD_FORMAT_INTERLEAVED = 1 << 0,

  // Checksum format: an Adler32 checksum of the target window data is
  // included in the delta window.
  VCD_FORMAT_CHECKSUM = 1 << 1
};

// Integer type used to pass VCDiffFormatExtensionFlagValues around; the
// interleaved and checksum flags occupy distinct bits, so they may be
// specified together.
typedef int VCDiffFormatExtensionFlags;
| 57 | |
| 58 // A HashedDictionary must be constructed from the dictionary data | |
| 59 // in order to use VCDiffStreamingEncoder. If the same dictionary will | |
| 60 // be used to perform several encoding operations, then the caller should | |
| 61 // create the HashedDictionary once and cache it for reuse. This object | |
| 62 // is thread-safe: the same const HashedDictionary can be used | |
| 63 // by several threads simultaneously, each with its own VCDiffStreamingEncoder. | |
| 64 // | |
| 65 // dictionary_contents is copied into the HashedDictionary, so the | |
| 66 // caller may free that string, if desired, after the constructor returns. | |
| 67 // | |
| 68 class HashedDictionary { | |
| 69 public: | |
| 70 HashedDictionary(const char* dictionary_contents, | |
| 71 size_t dictionary_size); | |
| 72 ~HashedDictionary(); | |
| 73 | |
| 74 // Init() must be called before using the HashedDictionary as an argument | |
| 75 // to the VCDiffStreamingEncoder, or for any other purpose except | |
| 76 // destruction. It returns true if initialization succeeded, or false | |
| 77 // if an error occurred, in which case the caller should destroy the object | |
| 78 // without using it. | |
| 79 bool Init(); | |
| 80 | |
| 81 const VCDiffEngine* engine() const { return engine_; } | |
| 82 | |
| 83 private: | |
| 84 const VCDiffEngine* engine_; | |
| 85 }; | |
| 86 | |
| 87 // The standard streaming interface to the VCDIFF (RFC 3284) encoder. | |
| 88 // "Streaming" in this context means that, even though the entire set of | |
| 89 // input data to be encoded may not be available at once, the encoder | |
| 90 // can produce partial output based on what is available. Of course, | |
| 91 // the caller should try to maximize the sizes of the data chunks passed | |
| 92 // to the encoder. | |
| 93 class VCDiffStreamingEncoder { | |
| 94 public: | |
| 95 // The HashedDictionary object passed to the constructor must remain valid, | |
| 96 // without being deleted, for the lifetime of the VCDiffStreamingEncoder | |
| 97 // object. | |
| 98 // | |
| 99 // format_extensions allows certain open-vcdiff extensions to the VCDIFF | |
| 100 // format to be included in the encoded output. These extensions are not | |
| 101 // part of the RFC 3284 draft standard, so specifying any extension flags | |
| 102 // will make the output compatible only with open-vcdiff, or with other | |
| 103 // VCDIFF implementations that accept these extensions. See above for an | |
| 104 // explanation of each possible flag value. | |
| 105 // | |
| 106 // *** look_for_target_matches: | |
| 107 // The VCDIFF format allows COPY instruction addresses to reference data from | |
| 108 // the source (dictionary), or from previously encoded target data. | |
| 109 // | |
| 110 // If look_for_target_matches is false, then the encoder will only | |
| 111 // produce COPY instructions that reference source data from the dictionary, | |
| 112 // never from previously encoded target data. This will speed up the encoding | |
| 113 // process, but the encoded data will not be as compact. | |
| 114 // | |
| 115 // If this value is true, then the encoder will produce COPY instructions | |
| 116 // that reference either source data or target data. A COPY instruction from | |
| 117 // the previously encoded target data may even extend into the range of the | |
| 118 // data being produced by that same COPY instruction; for example, if the | |
| 119 // previously encoded target data is "LA", then a single COPY instruction of | |
| 120 // length 10 can produce the additional target data "LALALALALA". | |
| 121 // | |
| 122 // There is a third type of COPY instruction that starts within | |
| 123 // the source data and extends from the end of the source data | |
| 124 // into the beginning of the target data. This VCDIFF encoder will never | |
| 125 // produce a COPY instruction of this third type (regardless of the value of | |
| 126 // look_for_target_matches) because the cost of checking for matches | |
| 127 // across the source-target boundary would not justify its benefits. | |
| 128 // | |
| 129 VCDiffStreamingEncoder(const HashedDictionary* dictionary, | |
| 130 VCDiffFormatExtensionFlags format_extensions, | |
| 131 bool look_for_target_matches); | |
| 132 ~VCDiffStreamingEncoder(); | |
| 133 | |
| 134 // The client should use these routines as follows: | |
| 135 // HashedDictionary hd(dictionary, dictionary_size); | |
| 136 // if (!hd.Init()) { | |
| 137 // HandleError(); | |
| 138 // return; | |
| 139 // } | |
| 140 // string output_string; | |
| 141 // VCDiffStreamingEncoder v(hd, false, false); | |
| 142 // if (!v.StartEncoding(&output_string)) { | |
| 143 // HandleError(); | |
| 144 // return; // No need to call FinishEncoding() | |
| 145 // } | |
| 146 // Process(output_string.data(), output_string.size()); | |
| 147 // output_string.clear(); | |
| 148 // while (get data_buf) { | |
| 149 // if (!v.EncodeChunk(data_buf, data_len, &output_string)) { | |
| 150 // HandleError(); | |
| 151 // return; // No need to call FinishEncoding() | |
| 152 // } | |
| 153 // // The encoding is appended to output_string at each call, | |
| 154 // // so clear output_string once its contents have been processed. | |
| 155 // Process(output_string.data(), output_string.size()); | |
| 156 // output_string.clear(); | |
| 157 // } | |
| 158 // if (!v.FinishEncoding(&output_string)) { | |
| 159 // HandleError(); | |
| 160 // return; | |
| 161 // } | |
| 162 // Process(output_string.data(), output_string.size()); | |
| 163 // output_string.clear(); | |
| 164 // | |
| 165 // I.e., the allowed pattern of calls is | |
| 166 // StartEncoding EncodeChunk* FinishEncoding | |
| 167 // | |
| 168 // The size of the encoded output depends on the sizes of the chunks | |
| 169 // passed in (i.e. the chunking boundary affects compression). | |
| 170 // However the decoded output is independent of chunk boundaries. | |
| 171 | |
| 172 // Sets up the data structures for encoding. | |
| 173 // Writes a VCDIFF delta file header (as defined in RFC section 4.1) | |
| 174 // to *output_string. | |
| 175 // | |
| 176 // Note: we *append*, so the old contents of *output_string stick around. | |
| 177 // This convention differs from the non-streaming Encode/Decode | |
| 178 // interfaces in VCDiffEncoder. | |
| 179 // | |
| 180 // If an error occurs, this function returns false; otherwise it returns true. | |
| 181 // If this function returns false, the caller does not need to call | |
| 182 // FinishEncoding or to do any cleanup except destroying the | |
| 183 // VCDiffStreamingEncoder object. | |
| 184 template<class OutputType> | |
| 185 bool StartEncoding(OutputType* output) { | |
| 186 OutputString<OutputType> output_string(output); | |
| 187 return StartEncodingToInterface(&output_string); | |
| 188 } | |
| 189 | |
| 190 bool StartEncodingToInterface(OutputStringInterface* output_string); | |
| 191 | |
| 192 // Appends compressed encoding for "data" (one complete VCDIFF delta window) | |
| 193 // to *output_string. | |
| 194 // If an error occurs (for example, if StartEncoding was not called | |
| 195 // earlier or StartEncoding returned false), this function returns false; | |
| 196 // otherwise it returns true. The caller does not need to call FinishEncoding | |
| 197 // or do any cleanup except destroying the VCDiffStreamingEncoder | |
| 198 // if this function returns false. | |
| 199 template<class OutputType> | |
| 200 bool EncodeChunk(const char* data, size_t len, OutputType* output) { | |
| 201 OutputString<OutputType> output_string(output); | |
| 202 return EncodeChunkToInterface(data, len, &output_string); | |
| 203 } | |
| 204 | |
| 205 bool EncodeChunkToInterface(const char* data, size_t len, | |
| 206 OutputStringInterface* output_string); | |
| 207 | |
| 208 // Finishes encoding and appends any leftover encoded data to *output_string. | |
| 209 // If an error occurs (for example, if StartEncoding was not called | |
| 210 // earlier or StartEncoding returned false), this function returns false; | |
| 211 // otherwise it returns true. The caller does not need to | |
| 212 // do any cleanup except destroying the VCDiffStreamingEncoder | |
| 213 // if this function returns false. | |
| 214 template<class OutputType> | |
| 215 bool FinishEncoding(OutputType* output) { | |
| 216 OutputString<OutputType> output_string(output); | |
| 217 return FinishEncodingToInterface(&output_string); | |
| 218 } | |
| 219 | |
| 220 bool FinishEncodingToInterface(OutputStringInterface* output_string); | |
| 221 | |
| 222 // Replaces the contents of match_counts with a vector of integers, | |
| 223 // one for each possible match length. The value of match_counts[n] | |
| 224 // is equal to the number of matches of length n found so far | |
| 225 // for this VCDiffStreamingEncoder object. | |
| 226 void GetMatchCounts(std::vector<int>* match_counts) const; | |
| 227 | |
| 228 private: | |
| 229 VCDiffStreamingEncoderImpl* const impl_; | |
| 230 | |
| 231 // Make the copy constructor and assignment operator private | |
| 232 // so that they don't inadvertently get used. | |
| 233 VCDiffStreamingEncoder(const VCDiffStreamingEncoder&); // NOLINT | |
| 234 void operator=(const VCDiffStreamingEncoder&); | |
| 235 }; | |
| 236 | |
| 237 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used | |
| 238 // if the entire target data string is available. | |
| 239 // | |
| 240 class VCDiffEncoder { | |
| 241 public: | |
| 242 VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size) | |
| 243 : dictionary_(dictionary_contents, dictionary_size), | |
| 244 encoder_(NULL), | |
| 245 flags_(VCD_STANDARD_FORMAT) { } | |
| 246 | |
| 247 ~VCDiffEncoder() { | |
| 248 delete encoder_; | |
| 249 } | |
| 250 | |
| 251 // By default, VCDiffEncoder uses standard VCDIFF format. This function | |
| 252 // can be used before calling Encode(), to specify that interleaved format | |
| 253 // and/or checksum format should be used. | |
| 254 void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; } | |
| 255 | |
| 256 // Replaces old contents of output_string with the encoded form of | |
| 257 // target_data. | |
| 258 template<class OutputType> | |
| 259 bool Encode(const char* target_data, | |
| 260 size_t target_len, | |
| 261 OutputType* output) { | |
| 262 OutputString<OutputType> output_string(output); | |
| 263 return EncodeToInterface(target_data, target_len, &output_string); | |
| 264 } | |
| 265 | |
| 266 private: | |
| 267 // Always look for matches in both source and target. This default value | |
| 268 // can be changed in this code if desired. | |
| 269 static const bool look_for_target_matches_ = true; | |
| 270 | |
| 271 bool EncodeToInterface(const char* target_data, | |
| 272 size_t target_len, | |
| 273 OutputStringInterface* output_string); | |
| 274 | |
| 275 HashedDictionary dictionary_; | |
| 276 VCDiffStreamingEncoder* encoder_; | |
| 277 VCDiffFormatExtensionFlags flags_; | |
| 278 | |
| 279 // Make the copy constructor and assignment operator private | |
| 280 // so that they don't inadvertently get used. | |
| 281 VCDiffEncoder(const VCDiffEncoder&); // NOLINT | |
| 282 void operator=(const VCDiffEncoder&); | |
| 283 }; | |
| 284 | |
| 285 } // namespace open_vcdiff | |
| 286 | |
| 287 #endif // OPEN_VCDIFF_VCENCODER_H_ | |
| OLD | NEW |