components/safe_browsing_db/v4_rice.cc - Issue 2228393003: PVer4: DecodeHashes needs to sort the output of the Rice decoder

Side by Side Diff: components/safe_browsing_db/v4_rice.cc

Issue 2228393003: PVer4: DecodeHashes needs to sort the output of the Rice decoder (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@01_checksum

Patch Set: Use htonl. Return from GetBitsFromCurrentWord Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

	5 #include <algorithm>

	6 #include <vector>

	7

5 #include "base/logging.h"	8 #include "base/logging.h"

6 #include "base/numerics/safe_math.h"	9 #include "base/numerics/safe_math.h"

7 #include "base/strings/stringprintf.h"	10 #include "base/strings/stringprintf.h"

	11 #include "build/build_config.h"

8 #include "components/safe_browsing_db/v4_rice.h"	12 #include "components/safe_browsing_db/v4_rice.h"

9	13

	14 #if defined(OS_WIN)

	15 #include <winsock2.h>

	16 #elif defined(OS_POSIX)

	17 #include <arpa/inet.h>

	18 #endif

	19

10 using ::google::protobuf::RepeatedField;	20 using ::google::protobuf::RepeatedField;

11 using ::google::protobuf::int32;	21 using ::google::protobuf::int32;

	22 using ::google::protobuf::int64;

	23

	24 #if !defined(ARCH_CPU_LITTLE_ENDIAN) \|\| (ARCH_CPU_LITTLE_ENDIAN != 1)

	25 #error The code below assumes little-endianness.

	26 #endif
	Scott Hess - ex-Googler 2016/08/17 22:42:27 Is this still necessary now that it's using htonl? Is this still necessary now that it's using htonl? vakh (use Gerrit instead) 2016/08/17 23:40:07 To be frank, I won't be 100% sure until I observe Show quoted text On 2016/08/17 at 22:42:27, Scott Hess wrote: > Is this still necessary now that it's using htonl? To be frank, I won't be 100% sure until I observe it. I'd much rather it fails loud and then someone can make it work, than it fails silently and no one notices.
12	27

13 namespace safe_browsing {	28 namespace safe_browsing {

14	29

15 namespace {	30 namespace {

16	31

17 const int kBitsPerByte = 8;	32 const int kBitsPerByte = 8;

18 const unsigned int kMaxBitIndex = kBitsPerByte * sizeof(uint32_t);	33 const unsigned int kMaxBitIndex = kBitsPerByte * sizeof(uint32_t);

19	34

20 void GetBytesFromUInt32(uint32_t word, char* bytes) {

21 const size_t mask = 0xFF;

22 bytes[0] = (char)(word & mask);

23 bytes[1] = (char)((word >> 8) & mask);

24 bytes[2] = (char)((word >> 16) & mask);

25 bytes[3] = (char)((word >> 24) & mask);

26 }

27

28 } // namespace	35 } // namespace

29	36

30 // static	37 // static

31 V4DecodeResult V4RiceDecoder::ValidateInput(const int32 rice_parameter,	38 V4DecodeResult V4RiceDecoder::ValidateInput(const int32 rice_parameter,

32 const int32 num_entries,	39 const int32 num_entries,

33 const std::string& encoded_data) {	40 const std::string& encoded_data) {

34 if (num_entries < 0) {	41 if (num_entries < 0) {

35 NOTREACHED();	42 NOTREACHED();

36 return NUM_ENTRIES_NEGATIVE_FAILURE;	43 return NUM_ENTRIES_NEGATIVE_FAILURE;

37 }	44 }

38	45

39 if (num_entries == 0) {	46 if (num_entries == 0) {

40 return DECODE_SUCCESS;	47 return DECODE_SUCCESS;

41 }	48 }

42	49

43 if (rice_parameter <= 0) {	50 if (rice_parameter <= 0) {

44 NOTREACHED();	51 NOTREACHED();

45 return RICE_PARAMETER_NON_POSITIVE_FAILURE;	52 return RICE_PARAMETER_NON_POSITIVE_FAILURE;

46 }	53 }

47	54

48 if (encoded_data.empty()) {	55 if (encoded_data.empty()) {

49 NOTREACHED();	56 NOTREACHED();

50 return ENCODED_DATA_UNEXPECTED_EMPTY_FAILURE;	57 return ENCODED_DATA_UNEXPECTED_EMPTY_FAILURE;

51 }	58 }

52	59

53 return DECODE_SUCCESS;	60 return DECODE_SUCCESS;

54 }	61 }

55	62

56 // static	63 // static

57 V4DecodeResult V4RiceDecoder::DecodeIntegers(const int32 first_value,	64 V4DecodeResult V4RiceDecoder::DecodeIntegers(const int64 first_value,

58 const int32 rice_parameter,	65 const int32 rice_parameter,

59 const int32 num_entries,	66 const int32 num_entries,

60 const std::string& encoded_data,	67 const std::string& encoded_data,

61 RepeatedField<int32>* out) {	68 RepeatedField<int32>* out) {

62 DCHECK(out);	69 DCHECK(out);

63	70

64 V4DecodeResult result =	71 V4DecodeResult result =

65 ValidateInput(rice_parameter, num_entries, encoded_data);	72 ValidateInput(rice_parameter, num_entries, encoded_data);

66 if (result != DECODE_SUCCESS) {	73 if (result != DECODE_SUCCESS) {

67 return result;	74 return result;

(...skipping 20 matching lines...) Expand all Loading...
88 return DECODED_INTEGER_OVERFLOW_FAILURE;	95 return DECODED_INTEGER_OVERFLOW_FAILURE;

89 }	96 }

90	97

91 out->Add(last_value.ValueOrDie());	98 out->Add(last_value.ValueOrDie());

92 }	99 }

93	100

94 return DECODE_SUCCESS;	101 return DECODE_SUCCESS;

95 }	102 }

96	103

97 // static	104 // static

98 V4DecodeResult V4RiceDecoder::DecodeBytes(const int32 first_value,	105 V4DecodeResult V4RiceDecoder::DecodeBytes(const int64 first_value,

99 const int32 rice_parameter,	106 const int32 rice_parameter,

100 const int32 num_entries,	107 const int32 num_entries,

101 const std::string& encoded_data,	108 const std::string& encoded_data,

102 std::string* out) {	109 std::vector<uint32_t>* out) {

103 DCHECK(out);	110 DCHECK(out);

104	111

105 V4DecodeResult result =	112 V4DecodeResult result =

106 ValidateInput(rice_parameter, num_entries, encoded_data);	113 ValidateInput(rice_parameter, num_entries, encoded_data);

107 if (result != DECODE_SUCCESS) {	114 if (result != DECODE_SUCCESS) {

108 return result;	115 return result;

109 }	116 }

110 out->reserve((num_entries + 1) * 4);	117 out->reserve((num_entries + 1));

111	118

112 // Cast to unsigned since we don't look at the sign bit as a sign bit.	119 base::CheckedNumeric<uint32_t> last_value(first_value);

113 // first_value should have been an unsigned to begin with but proto don't	120 out->push_back(htonl(last_value.ValueOrDie()));

114 // allow that.

115 uint32_t first_value_unsigned = static_cast<uint32_t>(first_value);

116 base::CheckedNumeric<uint32_t> last_value(first_value_unsigned);

117 char bytes[4];

118 GetBytesFromUInt32(last_value.ValueOrDie(), bytes);

119 out->append(bytes, 4);

120	121

121 if (num_entries == 0) {	122 if (num_entries > 0) {

122 return DECODE_SUCCESS;	123 V4RiceDecoder decoder(rice_parameter, num_entries, encoded_data);

	124 while (decoder.HasAnotherValue()) {

	125 uint32_t offset;

	126 result = decoder.GetNextValue(&offset);

	127 if (result != DECODE_SUCCESS) {

	128 return result;

	129 }

	130

	131 last_value += offset;

	132 if (!last_value.IsValid()) {

	133 NOTREACHED();

	134 return DECODED_INTEGER_OVERFLOW_FAILURE;

	135 }

	136

	137 // This flipping is done so that the decoded uint32 is interpreted

	138 // correcly as a string of 4 bytes.

	139 out->push_back(htonl(last_value.ValueOrDie()));

	140 }

123 }	141 }

124	142

125 V4RiceDecoder decoder(rice_parameter, num_entries, encoded_data);	143 // Flipping the bytes, as done above, destroys the sort order. Sort the

126 while (decoder.HasAnotherValue()) {	144 // values back.

127 uint32_t offset;	145 std::sort(out->begin(), out->end());
	Scott Hess - ex-Googler 2016/08/17 22:42:26 I think at least part of the confusion is that the I think at least part of the confusion is that there's a sorting constraint which is being explained from the bottom up (comments probably added as you discovered things), and maybe it would be cleaner if the header comment for this function described how the results should be sorted. AFAICT, the sort order is because someone elsewhere is going to interpret the raw vector::data() bytes. I'm not making a value judgement on _that_, I'm just saying that if you're going to bake that assumption in here, then you need to describe it because it's a completely non-obvious requirement for someone looking at this code. That said ... I'm also not entirely convinced that this should return a vector of integers, given that there's this odd sorting constraint and someone else is accessing it as raw data. If the only reason for the vector is for unit tests, then IMHO it would be reasonable for the unit tests to decode/flip things for purposes of testing. vakh (use Gerrit instead) 2016/08/17 23:40:07 Done. Let me know if that isn't clear enough for a Show quoted text On 2016/08/17 at 22:42:26, Scott Hess wrote: > I think at least part of the confusion is that there's a sorting constraint which is being explained from the bottom up (comments probably added as you discovered things), and maybe it would be cleaner if the header comment for this function described how the results should be sorted. > Done. Let me know if that isn't clear enough for a new reader. Show quoted text > AFAICT, the sort order is because someone elsewhere is going to interpret the raw vector::data() bytes. I'm not making a value judgement on _that_, I'm just saying that if you're going to bake that assumption in here, then you need to describe it because it's a completely non-obvious requirement for someone looking at this code. > Done. Show quoted text > That said ... I'm also not entirely convinced that this should return a vector of integers, given that there's this odd sorting constraint and someone else is accessing it as raw data. If the only reason for the vector is for unit tests, then IMHO it would be reasonable for the unit tests to decode/flip things for purposes of testing. The reason it returns a vector is that I am trying to interpret the same memory block some times as std::vector<uint32_t> and other times as std::string. The other way to do this would be to create a std::string, reserve its size appropriately, and then re-interpret that block of memory as vector<uint32_t>. The problem with that approach is that string::data() returns a const char * pointer, so I cannot modify the memory underneath that string. Of course I could cast away the constness but that's probably not a good idea either. So I am between a rock and a hard place.
128 result = decoder.GetNextValue(&offset);

129 if (result != DECODE_SUCCESS) {

130 return result;

131 }

132	146

133 last_value += offset;	147 // This flipping is done so that when the vector is interpreted as a string,

134 if (!last_value.IsValid()) {	148 // the bytes are in the correct order.

135 NOTREACHED();	149 for (unsigned i = 0; i < out->size(); i++) {

136 return DECODED_INTEGER_OVERFLOW_FAILURE;	150 (out)[i] = ntohl((out)[i]);

137 }

138

139 GetBytesFromUInt32(last_value.ValueOrDie(), bytes);

140 out->append(bytes, 4);

141 }	151 }

142	152

143 return DECODE_SUCCESS;	153 return DECODE_SUCCESS;

144 }	154 }

145	155

146 V4RiceDecoder::V4RiceDecoder(const int rice_parameter,	156 V4RiceDecoder::V4RiceDecoder(const int rice_parameter,

147 const int num_entries,	157 const int num_entries,

148 const std::string& encoded_data)	158 const std::string& encoded_data)

149 : rice_parameter_(rice_parameter),	159 : rice_parameter_(rice_parameter),

150 num_entries_(num_entries),	160 num_entries_(num_entries),

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
229 V4DecodeResult result = GetNextWord(&current_word_);	239 V4DecodeResult result = GetNextWord(&current_word_);

230 if (result != DECODE_SUCCESS) {	240 if (result != DECODE_SUCCESS) {

231 return result;	241 return result;

232 }	242 }

233 }	243 }

234	244

235 unsigned int num_bits_left_in_current_word =	245 unsigned int num_bits_left_in_current_word =

236 kMaxBitIndex - current_word_bit_index_;	246 kMaxBitIndex - current_word_bit_index_;

237 if (num_bits_left_in_current_word >= num_requested_bits) {	247 if (num_bits_left_in_current_word >= num_requested_bits) {

238 // All the bits that we need are in \|current_word_\|.	248 // All the bits that we need are in \|current_word_\|.

239 GetBitsFromCurrentWord(num_requested_bits, x);	249 *x = GetBitsFromCurrentWord(num_requested_bits);

240 } else {	250 } else {

241 // \|current_word_\| contains fewer bits than we need so we store the current	251 // \|current_word_\| contains fewer bits than we need so read the remaining

242 // value of \|current_word_\| in \|lower\|, then read in a new \|current_word_\|,	252 // bits from \|current_word_\| into \|lower\|, and then call GetNextBits on the

243 // and then pick the remaining bits from the new value of \|current_word_\|.	253 // remaining number of bits, which will read in a new word into

244 uint32_t lower = current_word_;	254 // \|current_word_\|.

245 // Bits we need after we read all of \|current_word_\|	255 uint32_t lower = GetBitsFromCurrentWord(num_bits_left_in_current_word);

	256

246 unsigned int num_bits_from_next_word =	257 unsigned int num_bits_from_next_word =

247 num_requested_bits - num_bits_left_in_current_word;	258 num_requested_bits - num_bits_left_in_current_word;

248 uint32_t upper;	259 uint32_t upper;

249 V4DecodeResult result = GetNextWord(&current_word_);	260 V4DecodeResult result = GetNextBits(num_bits_from_next_word, &upper);

250 if (result != DECODE_SUCCESS) {	261 if (result != DECODE_SUCCESS) {

251 return result;	262 return result;

252 }	263 }

253 GetBitsFromCurrentWord(num_bits_from_next_word, &upper);	264 *x = (upper << num_bits_left_in_current_word) \| lower;

254 *x = (upper << (num_requested_bits - num_bits_from_next_word)) \| lower;

255 }	265 }

256 return DECODE_SUCCESS;	266 return DECODE_SUCCESS;

257 }	267 }

258	268

259 void V4RiceDecoder::GetBitsFromCurrentWord(unsigned int num_requested_bits,	269 uint32_t V4RiceDecoder::GetBitsFromCurrentWord(

260 uint32_t* x) {	270 unsigned int num_requested_bits) {

261 uint32_t mask = 0xFFFFFFFF >> (kMaxBitIndex - num_requested_bits);	271 uint32_t mask = 0xFFFFFFFF >> (kMaxBitIndex - num_requested_bits);

262 *x = current_word_ & mask;	272 uint32_t x = current_word_ & mask;

263 current_word_ = current_word_ >> num_requested_bits;	273 current_word_ = current_word_ >> num_requested_bits;

264 current_word_bit_index_ += num_requested_bits;	274 current_word_bit_index_ += num_requested_bits;

	275 return x;

265 };	276 };

266	277

267 std::string V4RiceDecoder::DebugString() const {	278 std::string V4RiceDecoder::DebugString() const {

268 // Calculates the total number of bits that we have read from the buffer,	279 // Calculates the total number of bits that we have read from the buffer,

269 // excluding those that have been read into current_word_ but not yet	280 // excluding those that have been read into current_word_ but not yet

270 // consumed byt GetNextBits().	281 // consumed byt GetNextBits().

271 unsigned bits_read = (data_byte_index_ - sizeof(uint32_t)) * kBitsPerByte +	282 unsigned bits_read = (data_byte_index_ - sizeof(uint32_t)) * kBitsPerByte +

272 current_word_bit_index_;	283 current_word_bit_index_;

273 return base::StringPrintf(	284 return base::StringPrintf(

274 "bits_read: %x; current_word_: %x; data_byte_index_; %x, "	285 "bits_read: %x; current_word_: %x; data_byte_index_; %x, "

275 "current_word_bit_index_: %x; rice_parameter_: %x",	286 "current_word_bit_index_: %x; rice_parameter_: %x",

276 bits_read, current_word_, data_byte_index_, current_word_bit_index_,	287 bits_read, current_word_, data_byte_index_, current_word_bit_index_,

277 rice_parameter_);	288 rice_parameter_);

278 }	289 }

279	290

280 std::ostream& operator<<(std::ostream& os, const V4RiceDecoder& rice_decoder) {	291 std::ostream& operator<<(std::ostream& os, const V4RiceDecoder& rice_decoder) {

281 os << rice_decoder.DebugString();	292 os << rice_decoder.DebugString();

282 return os;	293 return os;

283 }	294 }

284	295

285 } // namespace safe_browsing	296 } // namespace safe_browsing

OLD	NEW