net/disk_cache/v3/index_table.cc - Issue 53313004: Disk cache v3: The main index table.

Side by Side Diff: net/disk_cache/v3/index_table.cc

Issue 53313004: Disk cache v3: The main index table. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: remove From*Address use Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "net/disk_cache/v3/index_table.h"

	6

	7 #include <algorithm>

	8 #include <set>

	9 #include <utility>

	10

	11 #include "base/bits.h"

	12 #include "net/base/io_buffer.h"

	13 #include "net/base/net_errors.h"

	14 #include "net/disk_cache/disk_cache.h"

	15

	16 using base::Time;

	17 using base::TimeDelta;

	18 using disk_cache::CellInfo;

	19 using disk_cache::CellList;

	20 using disk_cache::IndexCell;

	21 using disk_cache::IndexIterator;

	22

	23 namespace {

	24

	25 const int kCellHashOffset = 22;

	26 const int kCellSmallTableHashOffset = 16;

	27 const int kCellTimestampOffset = 40;

	28 const int kCellReuseOffset = 60;

	29 const int kCellGroupOffset = 3;

	30 const int kCellSumOffset = 6;

	31

	32 const uint64 kCellAddressMask = 0x3FFFFF;

	33 const uint64 kCellSmallTableAddressMask = 0xFFFF;

	34 const uint64 kCellHashMask = 0x3FFFF;

	35 const uint64 kCellSmallTableHashMask = 0xFFFFFF;

	36 const uint64 kCellTimestampMask = 0xFFFFF;

	37 const uint64 kCellReuseMask = 0xF;

	38 const uint8 kCellStateMask = 0x7;

	39 const uint8 kCellGroupMask = 0x7;

	40 const uint8 kCellSumMask = 0x3;

	41

	42 const int kHashShift = 14;

	43 const int kHashSmallTableShift = 8;

	44

	45 // Unfortunately we have to break the abstraction a little here: the file number

	46 // where entries are stored is outside of the control of this code, and it is

	47 // usually part of the stored address. However, for small tables we only store

	48 // 16 bits of the address so the file number is never stored on a cell. We have

	49 // to infer the file number from the type of entry (normal vs evicted), and

	50 // the knowledge that given that the table will not keep more than 64k entries,

	51 // a single file of each type is enough.

	52 const int kEntriesFile = disk_cache::BLOCK_ENTRIES - 1;

	53 const int kEvictedEntriesFile = disk_cache::BLOCK_EVICTED - 1;

	54 const int kMaxAddress = 1 << 22;

	55 const int kMinFileNumber = 1 << 16;

	56

	57 uint32 GetCellAddress(const IndexCell& cell) {

	58 return cell.first_part & kCellAddressMask;

	59 }

	60

	61 uint32 GetCellSmallTableAddress(const IndexCell& cell) {

	62 return cell.first_part & kCellSmallTableAddressMask;

	63 }

	64

	65 uint32 GetCellHash(const IndexCell& cell) {

	66 return (cell.first_part >> kCellHashOffset) & kCellHashMask;

	67 }

	68

	69 uint32 GetCellSmallTableHash(const IndexCell& cell) {

	70 return (cell.first_part >> kCellSmallTableHashOffset) &

	71 kCellSmallTableHashMask;

	72 }

	73

	74 int GetCellTimestamp(const IndexCell& cell) {

	75 return (cell.first_part >> kCellTimestampOffset) & kCellTimestampMask;

	76 }

	77

	78 int GetCellReuse(const IndexCell& cell) {

	79 return (cell.first_part >> kCellReuseOffset) & kCellReuseMask;

	80 }

	81

	82 int GetCellState(const IndexCell& cell) {

	83 return cell.last_part & kCellStateMask;

	84 }

	85

	86 int GetCellGroup(const IndexCell& cell) {

	87 return (cell.last_part >> kCellGroupOffset) & kCellGroupMask;

	88 }

	89

	90 int GetCellSum(const IndexCell& cell) {

	91 return (cell.last_part >> kCellSumOffset) & kCellSumMask;

	92 }

	93

	94 void SetCellAddress(IndexCell* cell, uint32 address) {

	95 DCHECK_LE(address, static_cast<uint32>(kCellAddressMask));

	96 cell->first_part &= ~kCellAddressMask;

	97 cell->first_part \|= address;

	98 }

	99

	100 void SetCellSmallTableAddress(IndexCell* cell, uint32 address) {

	101 DCHECK_LE(address, static_cast<uint32>(kCellSmallTableAddressMask));

	102 cell->first_part &= ~kCellSmallTableAddressMask;

	103 cell->first_part \|= address;

	104 }

	105

	106 void SetCellHash(IndexCell* cell, uint32 hash) {

	107 DCHECK_LE(hash, static_cast<uint32>(kCellHashMask));

	108 cell->first_part &= ~(kCellHashMask << kCellHashOffset);

	109 cell->first_part \|= static_cast<int64>(hash) << kCellHashOffset;

	110 }

	111

	112 void SetCellSmallTableHash(IndexCell* cell, uint32 hash) {

	113 DCHECK_LE(hash, static_cast<uint32>(kCellSmallTableHashMask));

	114 cell->first_part &= ~(kCellSmallTableHashMask << kCellSmallTableHashOffset);

	115 cell->first_part \|= static_cast<int64>(hash) << kCellSmallTableHashOffset;

	116 }

	117

	118 void SetCellTimestamp(IndexCell* cell, int timestamp) {

	119 DCHECK_LT(timestamp, 1 << 20);

	120 DCHECK_GE(timestamp, 0);

	121 cell->first_part &= ~(kCellTimestampMask << kCellTimestampOffset);

	122 cell->first_part \|= static_cast<int64>(timestamp) << kCellTimestampOffset;

	123 }

	124

	125 void SetCellReuse(IndexCell* cell, int count) {

	126 DCHECK_LT(count, 16);

	127 DCHECK_GE(count, 0);

	128 cell->first_part &= ~(kCellReuseMask << kCellReuseOffset);

	129 cell->first_part \|= static_cast<int64>(count) << kCellReuseOffset;

	130 }

	131

	132 void SetCellState(IndexCell* cell, disk_cache::EntryState state) {

	133 cell->last_part &= ~kCellStateMask;

	134 cell->last_part \|= state;

	135 }

	136

	137 void SetCellGroup(IndexCell* cell, disk_cache::EntryGroup group) {

	138 cell->last_part &= ~(kCellGroupMask << kCellGroupOffset);

	139 cell->last_part \|= group << kCellGroupOffset;

	140 }

	141

	142 void SetCellSum(IndexCell* cell, int sum) {

	143 DCHECK_LT(sum, 4);

	144 DCHECK_GE(sum, 0);

	145 cell->last_part &= ~(kCellSumMask << kCellSumOffset);

	146 cell->last_part \|= sum << kCellSumOffset;

	147 }

	148

	149 // This is a very particular way to calculate the sum, so it will not match if

	150 // compared against a pure 2 bit, modulo 2 sum.

	151 int CalculateCellSum(const IndexCell& cell) {

	152 uint32* words = bit_cast<uint32*>(&cell);

	153 uint8* bytes = bit_cast<uint8*>(&cell);

	154 uint32 result = words[0] + words[1];

	155 result += result >> 16;

	156 result += (result >> 8) + (bytes[8] & 0x3f);

	157 result += result >> 4;

	158 result += result >> 2;

	159 return result & 3;

	160 }

	161

	162 bool SanityCheck(const IndexCell& cell) {

	163 if (GetCellSum(cell) != CalculateCellSum(cell))

	164 return false;

	165

	166 if (GetCellState(cell) > disk_cache::ENTRY_USED \|\|

	167 GetCellGroup(cell) == disk_cache::ENTRY_RESERVED \|\|

	168 GetCellGroup(cell) > disk_cache::ENTRY_EVICTED) {

	169 return false;

	170 }

	171

	172 return true;

	173 }

	174

	175 int FileNumberFromAddress(int index_address) {

	176 return index_address / kMinFileNumber;

	177 }

	178

	179 int StartBlockFromAddress(int index_address) {

	180 return index_address % kMinFileNumber;

	181 }

	182

	183 bool IsValidAddress(disk_cache::Addr address) {

	184 if (!address.is_initialized() \|\|

	185 (address.file_type() != disk_cache::BLOCK_EVICTED &&

	186 address.file_type() != disk_cache::BLOCK_ENTRIES)) {

	187 return false;

	188 }

	189

	190 return address.FileNumber() < FileNumberFromAddress(kMaxAddress);

	191 }

	192

	193 bool IsNormalState(const IndexCell& cell) {

	194 disk_cache::EntryState state =

	195 static_cast<disk_cache::EntryState>(GetCellState(cell));

	196 DCHECK_NE(state, disk_cache::ENTRY_FREE);

	197 return state != disk_cache::ENTRY_DELETED &&

	198 state != disk_cache::ENTRY_FIXING;

	199 }

	200

	201 inline int GetNextBucket(int min_bucket_id, int max_bucket_id,

	202 disk_cache::IndexBucket* table,

	203 disk_cache::IndexBucket** bucket) {

	204 if (!(*bucket)->next)

	205 return 0;

	206

	207 int bucket_id = (*bucket)->next / disk_cache::kCellsPerBucket;

	208 if (bucket_id < min_bucket_id \|\| bucket_id > max_bucket_id) {

	209 (*bucket)->next = 0;

	210 return 0;

	211 }

	212 *bucket = &table[bucket_id - min_bucket_id];

	213 return bucket_id;

	214 }

	215

	216 // Updates the \|iterator\| with the current \|cell\|. This cell may cause all

	217 // previous cells to be deleted (when a new target timestamp is found), the cell

	218 // may be added to the list (if it matches the target timestamp), or it may be

	219 // ignored.

	220 void UpdateIterator(const disk_cache::EntryCell& cell,

	221 int limit_time,

	222 IndexIterator* iterator) {

	223 int time = cell.GetTimestamp();

	224 // Look for not interesting times.

	225 if (iterator->forward && time <= limit_time)

	226 return;

	227 if (!iterator->forward && time >= limit_time)

	228 return;

	229

	230 if ((iterator->forward && time < iterator->timestamp) \|\|

	231 (!iterator->forward && time > iterator->timestamp)) {

	232 // This timestamp is better than the one we had.

	233 iterator->timestamp = time;

	234 iterator->cells.clear();

	235 }

	236 if (time == iterator->timestamp) {

	237 CellInfo cell_info = { cell.hash(), cell.GetAddress() };

	238 iterator->cells.push_back(cell_info);

	239 }

	240 }

	241

	242 void InitIterator(IndexIterator* iterator) {

	243 iterator->cells.clear();

	244 iterator->timestamp = iterator->forward ? kint32max : 0;

	245 }

	246

	247 } // namespace

	248

	249 namespace disk_cache {

	250

	251 EntryCell::~EntryCell() {

	252 }

	253

	254 bool EntryCell::IsValid() const {

	255 return GetCellAddress(cell_) != 0;

	256 }

	257

	258 // This code has to map the cell address (up to 22 bits) to a general cache Addr

	259 // (up to 24 bits of general addressing). It also sets the implied file_number

	260 // in the case of small tables. See also the comment by the definition of

	261 // kEntriesFile.

	262 Addr EntryCell::GetAddress() const {

	263 uint32 address_value = GetAddressValue();

	264 int file_number = FileNumberFromAddress(address_value);

	265 if (small_table_) {

	266 DCHECK_EQ(0, file_number);

	267 file_number = (GetGroup() == ENTRY_EVICTED) ? kEvictedEntriesFile :

	268 kEntriesFile;

	269 }

	270 DCHECK_NE(0, file_number);

	271 FileType file_type = (GetGroup() == ENTRY_EVICTED) ? BLOCK_EVICTED :

	272 BLOCK_ENTRIES;

	273 return Addr(file_type, 1, file_number, StartBlockFromAddress(address_value));

	274 }

	275

	276 EntryState EntryCell::GetState() const {

	277 return static_cast<EntryState>(cell_.last_part & kCellStateMask);

	278 }

	279

	280 EntryGroup EntryCell::GetGroup() const {

	281 return static_cast<EntryGroup>((cell_.last_part >> kCellGroupOffset) &

	282 kCellGroupMask);

	283 }

	284

	285 int EntryCell::GetReuse() const {

	286 return (cell_.first_part >> kCellReuseOffset) & kCellReuseMask;

	287 }

	288

	289 int EntryCell::GetTimestamp() const {

	290 return GetCellTimestamp(cell_);

	291 }

	292

	293 void EntryCell::SetState(EntryState state) {

	294 SetCellState(&cell_, state);

	295 }

	296

	297 void EntryCell::SetGroup(EntryGroup group) {

	298 SetCellGroup(&cell_, group);

	299 }

	300

	301 void EntryCell::SetReuse(int count) {

	302 SetCellReuse(&cell_, count);

	303 }

	304

	305 void EntryCell::SetTimestamp(int timestamp) {

	306 SetCellTimestamp(&cell_, timestamp);

	307 }

	308

	309 // Static.

	310 EntryCell EntryCell::GetEntryCellForTest(int32 cell_id,

	311 uint32 hash,

	312 Addr address,

	313 IndexCell* cell,

	314 bool small_table) {

	315 if (cell) {

	316 EntryCell entry_cell(cell_id, hash, *cell, small_table);

	317 return entry_cell;

	318 }

	319

	320 return EntryCell(cell_id, hash, address, small_table);

	321 }

	322

	323 void EntryCell::SerializaForTest(IndexCell* destination) {

	324 FixSum();

	325 Serialize(destination);

	326 }

	327

	328 EntryCell::EntryCell() : cell_id_(0), hash_(0), small_table_(false) {

	329 cell_.Clear();

	330 }

	331

	332 EntryCell::EntryCell(int32 cell_id, uint32 hash, Addr address, bool small_table)

	333 : cell_id_(cell_id),

	334 hash_(hash),

	335 small_table_(small_table) {

	336 DCHECK(IsValidAddress(address) \|\| !address.value());

	337

	338 cell_.Clear();

	339 SetCellState(&cell_, ENTRY_NEW);

	340 SetCellGroup(&cell_, ENTRY_NO_USE);

	341 if (small_table) {

	342 DCHECK(address.FileNumber() == kEntriesFile \|\|

	343 address.FileNumber() == kEvictedEntriesFile);

	344 SetCellSmallTableAddress(&cell_, address.start_block());

	345 SetCellSmallTableHash(&cell_, hash >> kHashSmallTableShift);

	346 } else {

	347 SetCellAddress(&cell_, address.ToIndexEntryAddress());

	348 SetCellHash(&cell_, hash >> kHashShift);

	349 }

	350 }

	351

	352 EntryCell::EntryCell(int32 cell_id,

	353 uint32 hash,

	354 const IndexCell& cell,

	355 bool small_table)

	356 : cell_id_(cell_id),

	357 hash_(hash),

	358 cell_(cell),

	359 small_table_(small_table) {

	360 }

	361

	362 void EntryCell::FixSum() {

	363 SetCellSum(&cell_, CalculateCellSum(cell_));

	364 }

	365

	366 uint32 EntryCell::GetAddressValue() const {

	367 if (small_table_)

	368 return GetCellSmallTableAddress(cell_);

	369

	370 return GetCellAddress(cell_);

	371 }

	372

	373 uint32 EntryCell::RecomputeHash() {

	374 if (small_table_) {

	375 hash_ &= (1 << kHashSmallTableShift) - 1;

	376 hash_ \|= GetCellSmallTableHash(cell_) << kHashSmallTableShift;

	377 return hash_;

	378 }

	379

	380 hash_ &= (1 << kHashShift) - 1;

	381 hash_ \|= GetCellHash(cell_) << kHashShift;

	382 return hash_;

	383 }

	384

	385 void EntryCell::Serialize(IndexCell* destination) const {

	386 *destination = cell_;

	387 }

	388

	389 EntrySet::EntrySet() : evicted_count(0), current(0) {

	390 }

	391

	392 EntrySet::~EntrySet() {

	393 }

	394

	395 IndexIterator::IndexIterator() {

	396 }

	397

	398 IndexIterator::~IndexIterator() {

	399 }

	400

	401 IndexTableInitData::IndexTableInitData() {

	402 }

	403

	404 IndexTableInitData::~IndexTableInitData() {

	405 }

	406

	407 // -----------------------------------------------------------------------

	408

	409 IndexTable::IndexTable(IndexTableBackend* backend)

	410 : backend_(backend),

	411 header_(NULL),

	412 main_table_(NULL),

	413 extra_table_(NULL),

	414 modified_(false),

	415 small_table_(false) {

	416 }

	417

	418 IndexTable::~IndexTable() {

	419 }

	420

	421 // For a general description of the index tables see:

	422 // http://www.chromium.org/developers/design-documents/network-stack/disk-cache/disk-cache-v3#TOC-Index

	423 //

	424 // The index is split between two tables: the main_table_ and the extra_table_.

	425 // The main table can grow only by doubling its number of cells, while the

	426 // extra table can grow slowly, because it only contains cells that overflow

	427 // from the main table. In order to locate a given cell, part of the hash is

	428 // used directly as an index into the main table; once that bucket is located,

	429 // all cells with that partial hash (i.e., belonging to that bucket) are

	430 // inspected, and if present, the next bucket (located on the extra table) is

	431 // then located. For more information on bucket chaining see:

	432 // http://www.chromium.org/developers/design-documents/network-stack/disk-cache/disk-cache-v3#TOC-Buckets

	433 //

	434 // There are two cases when increasing the size:

	435 // - Doubling the size of the main table

	436 // - Adding more entries to the extra table

	437 //

	438 // For example, consider a 64k main table with 8k cells on the extra table (for

	439 // a total of 72k cells). Init can be called to add another 8k cells at the end

	440 // (grow to 80k cells). When the size of the extra table approaches 64k, Init

	441 // can be called to double the main table (to 128k) and go back to a small extra

	442 // table.

	443 void IndexTable::Init(IndexTableInitData* params) {

	444 bool growing = header_ != NULL;

	445 scoped_ptr<IndexBucket[]> old_extra_table;

	446 header_ = &params->index_bitmap->header;

	447

	448 if (params->main_table) {

	449 if (main_table_) {

	450 // This is doubling the size of main table.

	451 DCHECK_EQ(base::bits::Log2Floor(header_->table_len),

	452 base::bits::Log2Floor(backup_header_->table_len) + 1);

	453 int extra_size = (header()->max_bucket - mask_) * kCellsPerBucket;

	454 DCHECK_GE(extra_size, 0);
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 Does this mean we always have to incrementally gro Does this mean we always have to incrementally grow the extra table in between doublings of the main table? Why put that restriction on it? rvargas (doing something else) 2013/11/26 00:32:41 Nope. It can be zero. Show quoted text On 2013/11/25 19:48:07, rdsmith wrote: > Does this mean we always have to incrementally grow the extra table in between > doublings of the main table? Why put that restriction on it? Nope. It can be zero. Randy Smith (Not in Mondays) 2013/12/02 21:48:06 Huh. I guess I had a braino; dunno why I thought Show quoted text On 2013/11/26 00:32:41, rvargas wrote: > On 2013/11/25 19:48:07, rdsmith wrote: > > Does this mean we always have to incrementally grow the extra table in between > > doublings of the main table? Why put that restriction on it? > > Nope. It can be zero. Huh. I guess I had a braino; dunno why I thought it was GT.
	455

	456 // Doubling the size implies deleting the extra table and moving as many

	457 // cells as we can to the main table, so we first copy the old one. This

	458 // is not required when just growing the extra table because we don't

	459 // move any cell in that case.

	460 old_extra_table.reset(new IndexBucket[extra_size]);

	461 memcpy(old_extra_table.get(), extra_table_,

	462 extra_size * sizeof(IndexBucket));

	463 memset(params->extra_table, 0, extra_size * sizeof(IndexBucket));
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 It feels to me like there are assumptions in this It feels to me like there are assumptions in this code about aliasing, but I'm not certain what they are. Let me go over what I believe the context for this code is and make comments conditional on that. I believe that this function will be called in only three situations: * First initialization. main_table_ will be null, params->main_table will point to memory mapped from a file, and params->extra_table will be null. * Growth of extra table. params->main_table will be NULL. params->extra_table != extra_table_, but will point to a memory mapped region of the same file, only larger. Thus any changes made to params->extra_table will be reflected in extra_table_ and vice versa (for the length that extra_table_ is mapped for). extra_table_ will be unmapped after this function returns (does the calling code have it's own pointer to extra_table_?) * Doubling of main table. params->main_table != main_table_, but will point to a memory mapped region of the same file, only doubled in size. params->extra_table will point to a section of memory mapped to the extra table file, that hasn't changed in size any. params->extra_table != extra_table_ and the location pointed to by extra_table_ will be unmapped after this routine finishes (??), but the two memory regions will be direct aliases to one another. (I'll probably want a comment clarifying these assumptions somewhere, but I'll wait to suggest what until I'm certain I understand it all. :-}) Assuming that that's correct, why refer to the same data through two different points (extra_table_ and params->extra_table)? It seems like it's just going to generate confusion. rvargas (doing something else) 2013/11/26 00:32:41 params->extra_table should not be null. Show quoted text On 2013/11/25 19:48:07, rdsmith wrote: > It feels to me like there are assumptions in this code about aliasing, but I'm > not certain what they are. 
Let me go over what I believe the context for this > code is and make comments conditional on that. > > I believe that this function will be called in only three situations: > * First initialization. main_table_ will be null, params->main_table will point > to memory mapped from a file, and params->extra_table will be null. params->extra_table should not be null. Show quoted text > * Growth of extra table. params->main_table will be NULL. params->extra_table > != extra_table_, but will point to a memory mapped region of the same file, only > larger. Thus any changes made to params->extra_table will be reflected in > extra_table_ and vice versa (for the length that extra_table_ is mapped for). > extra_table_ will be unmapped after this function returns (does the calling code > have it's own pointer to extra_table_?) Yes, someone will know how to unmap the old table after this method returns. Show quoted text > * Doubling of main table. params->main_table != main_table_, but will point to > a memory mapped region of the same file, only doubled in size. > params->extra_table will point to a section of memory mapped to the extra table > file, that hasn't changed in size any. params->extra_table != extra_table_ and > the location pointed to by extra_table_ will be unmapped after this routine > finishes (??), but the two memory regions will be direct aliases to one another. sounds correct. The underlying assumption is that the index is memory mapped and follows the format (and files) described in disk_format_v3.h, so the act of re-initializing the IndexTable is a consequence of growing some file and creating another map of the new extension. As such, we should never receive the same pointer that we are currently using (but there's no check for that, because it shouldn't really matter), and we deal with pointers not mapped sections (in fact all the unit tests just allocate memory, never map anything). 
Aliasing is important in that we don't memcpy data from the old mapping to the new mapping when the file just grows, as that would be a waste of time. So yes, this has a memory interface, but expects the caller to deal with the files as defined by the file format, not just random pointers (hence TestCacheTables::CopyFrom simulating that behavior) But that should be clear from the comment at the top of index_table.h, right? Show quoted text > (I'll probably want a comment clarifying these assumptions somewhere, but I'll > wait to suggest what until I'm certain I understand it all. :-}) > > Assuming that that's correct, why refer to the same data through two different > points (extra_table_ and params->extra_table)? It seems like it's just going to > generate confusion. Do I? I try to refer to the data that logically is being accessed... so for instance line 461 copies the old data somewhere safe and line 463 clears the storage for the new data. Production code could copy the new data and clear the old data, but that would confuse the reader and prevent the unit tests from working :) I mean, the whole thing requires mapping (or a judicious emulation from the caller), but if this code is misusing source or destination somewhere assuming that they are the same, let me know so that it's fixed. So, conceptually, the caller could suspend all requests to this class, copy the file to another file and map the new file (so that there is no aliasing) before calling into this class again asking for re-init. Randy Smith (Not in Mondays) 2013/12/02 21:48:06 So what's the size of the extra_table on init? Ju Show quoted text On 2013/11/26 00:32:41, rvargas wrote: > On 2013/11/25 19:48:07, rdsmith wrote: > > It feels to me like there are assumptions in this code about aliasing, but I'm > > not certain what they are. Let me go over what I believe the context for > this > > code is and make comments conditional on that. 
> > > > I believe that this function will be called in only three situations: > > * First initialization. main_table_ will be null, params->main_table will > point > > to memory mapped from a file, and params->extra_table will be null. > > params->extra_table should not be null. So what's the size of the extra_table on init? Just the header? Show quoted text > > > * Growth of extra table. params->main_table will be NULL. > params->extra_table > > != extra_table_, but will point to a memory mapped region of the same file, > only > > larger. Thus any changes made to params->extra_table will be reflected in > > extra_table_ and vice versa (for the length that extra_table_ is mapped for). > > extra_table_ will be unmapped after this function returns (does the calling > code > > have it's own pointer to extra_table_?) > > Yes, someone will know how to unmap the old table after this method returns. > > > * Doubling of main table. params->main_table != main_table_, but will point > to > > a memory mapped region of the same file, only doubled in size. > > params->extra_table will point to a section of memory mapped to the extra > table > > file, that hasn't changed in size any. params->extra_table != extra_table_ > and > > the location pointed to by extra_table_ will be unmapped after this routine > > finishes (??), but the two memory regions will be direct aliases to one > another. > > sounds correct. > > The underlying assumption is that the index is memory mapped and follows the > format (and files) described in disk_format_v3.h, so the act of re-initializing > the IndexTable is a consequence of growing some file and creating another map of > the new extension. As such, we should never receive the same pointer that we are > currently using (but there's no check for that, because it shouldn't really > matter), and we deal with pointers not mapped sections (in fact all the unit > tests just allocate memory, never map anything). 
> > Aliasing is important in that we don't memcpy data from the old mapping to the > new mapping when the file just grows, as that would be a waste of time. So yes, > this has a memory interface, but expects the caller to deal with the files as > defined by the file format, not just random pointers (hence > TestCacheTables::CopyFrom simulating that behavior) > > But that should be clear from the comment at the top of index_table.h, right? Re-reading that comment, I find it more abstract that I'd like; I'd rather the aliasing assumptions were made explicit, either in the comment to this function or in the comments to IndexTableInitData. But see later. Show quoted text > > (I'll probably want a comment clarifying these assumptions somewhere, but I'll > > wait to suggest what until I'm certain I understand it all. :-}) > > > > Assuming that that's correct, why refer to the same data through two different > > points (extra_table_ and params->extra_table)? It seems like it's just going > to > > generate confusion. > > Do I? I try to refer to the data that logically is being accessed... so for > instance line 461 copies the old data somewhere safe and line 463 clears the > storage for the new data. Production code could copy the new data and clear the > old data, but that would confuse the reader and prevent the unit tests from > working :) > > I mean, the whole thing requires mapping (or a judicious emulation from the > caller), but if this code is misusing source or destination somewhere assuming > that they are the same, let me know so that it's fixed. > > So, conceptually, the caller could suspend all requests to this class, copy the > file to another file and map the new file (so that there is no aliasing) before > calling into this class again asking for re-init. Ah! Thank you. That's a (hypothetical) use case that's very useful for me. 
Can we put in a comment (before this function or in IndexTableInitData) that if this function is being called a second time, the caller of Init guarantees that the data pointed to by params->main_table will be an improper (i.e. possibly identical) super-set of what was previously passed, and that if main_table isn't being grown, params->extra_table will similarly be a super-set of what was previously passed? I'd also like to have the callers responsibility to deallocate the memory that main_table_ and extra_table_ used to point to made explicit. If one isn't thinking about this function in terms of memory mapped files, it's somewhat surprising; it effectively means that the caller owns that memory, which I don't think is the default assumption one would have looking at the call. rvargas (doing something else) 2013/12/04 01:04:17 There is no header on the extra table, but it has Show quoted text On 2013/12/02 21:48:06, rdsmith wrote: > On 2013/11/26 00:32:41, rvargas wrote: > > On 2013/11/25 19:48:07, rdsmith wrote: > > > It feels to me like there are assumptions in this code about aliasing, but > I'm > > > not certain what they are. Let me go over what I believe the context for > > this > > > code is and make comments conditional on that. > > > > > > I believe that this function will be called in only three situations: > > > * First initialization. main_table_ will be null, params->main_table will > > point > > > to memory mapped from a file, and params->extra_table will be null. > > > > params->extra_table should not be null. > > So what's the size of the extra_table on init? Just the header? There is no header on the extra table, but it has to have some space there to be able to store something. The reason is that no matter how big the main table is, it may take as little as 5 entries to require space on the extra table. Backend unit tests start with something like 8 cells, and production code starts with 1/8th of the main table. 
Show quoted text > > > > > > * Growth of extra table. params->main_table will be NULL. > > params->extra_table > > > != extra_table_, but will point to a memory mapped region of the same file, > > only > > > larger. Thus any changes made to params->extra_table will be reflected in > > > extra_table_ and vice versa (for the length that extra_table_ is mapped > for). > > > extra_table_ will be unmapped after this function returns (does the calling > > code > > > have it's own pointer to extra_table_?) > > > > Yes, someone will know how to unmap the old table after this method returns. > > > > > * Doubling of main table. params->main_table != main_table_, but will point > > to > > > a memory mapped region of the same file, only doubled in size. > > > params->extra_table will point to a section of memory mapped to the extra > > table > > > file, that hasn't changed in size any. params->extra_table != extra_table_ > > and > > > the location pointed to by extra_table_ will be unmapped after this routine > > > finishes (??), but the two memory regions will be direct aliases to one > > another. > > > > sounds correct. > > > > The underlying assumption is that the index is memory mapped and follows the > > format (and files) described in disk_format_v3.h, so the act of > re-initializing > > the IndexTable is a consequence of growing some file and creating another map > of > > the new extension. As such, we should never receive the same pointer that we > are > > currently using (but there's no check for that, because it shouldn't really > > matter), and we deal with pointers not mapped sections (in fact all the unit > > tests just allocate memory, never map anything). > > > > Aliasing is important in that we don't memcpy data from the old mapping to the > > new mapping when the file just grows, as that would be a waste of time. 
So > yes, > > this has a memory interface, but expects the caller to deal with the files as > > defined by the file format, not just random pointers (hence > > TestCacheTables::CopyFrom simulating that behavior) > > > > But that should be clear from the comment at the top of index_table.h, right? > > Re-reading that comment, I find it more abstract that I'd like; I'd rather the > aliasing assumptions were made explicit, either in the comment to this function > or in the comments to IndexTableInitData. But see later. I'm a little worried about this comment. The header mentions multiple files that can be remapped while the cache is working and that management is external to this file, so it looks pretty concrete to me. Without the files being mmapped, the whole IndexTable would be completely impractical so that's not exactly a detail. I'll extend the comment for Init() Show quoted text > > > > (I'll probably want a comment clarifying these assumptions somewhere, but > I'll > > > wait to suggest what until I'm certain I understand it all. :-}) > > > > > > Assuming that that's correct, why refer to the same data through two > different > > > points (extra_table_ and params->extra_table)? It seems like it's just > going > > to > > > generate confusion. > > > > Do I? I try to refer to the data that logically is being accessed... so for > > instance line 461 copies the old data somewhere safe and line 463 clears the > > storage for the new data. Production code could copy the new data and clear > the > > old data, but that would confuse the reader and prevent the unit tests from > > working :) > > > > I mean, the whole thing requires mapping (or a judicious emulation from the > > caller), but if this code is misusing source or destination somewhere assuming > > that they are the same, let me know so that it's fixed. 
> > > > So, conceptually, the caller could suspend all requests to this class, copy > the > > file to another file and map the new file (so that there is no aliasing) > before > > calling into this class again asking for re-init. > > Ah! Thank you. That's a (hypothetical) use case that's very useful for me. > Can we put in a comment (before this function or in IndexTableInitData) that if > this function is being called a second time, the caller of Init guarantees that > the data pointed to by params->main_table will be an improper (i.e. possibly > identical) super-set of what was previously passed, and that if main_table isn't > being grown, params->extra_table will similarly be a super-set of what was > previously passed? > > I'd also like to have the callers responsibility to deallocate the memory that > main_table_ and extra_table_ used to point to made explicit. If one isn't > thinking about this function in terms of memory mapped files, it's somewhat > surprising; it effectively means that the caller owns that memory, which I don't > think is the default assumption one would have looking at the call. > The memory ownership should be clear by the definition of IndexTableInitData. There is ownership transfer for the backups, but not for the bitmap or tables. That's the (relatively new) coding standard so it feels a little weird if that is repeated by comments. Randy Smith (Not in Mondays) 2013/12/05 19:17:48 I'm good with the IndexTable code explicitly relyi Show quoted text On 2013/12/04 01:04:17, rvargas wrote: > On 2013/12/02 21:48:06, rdsmith wrote: > > On 2013/11/26 00:32:41, rvargas wrote: > > > On 2013/11/25 19:48:07, rdsmith wrote: > > > > It feels to me like there are assumptions in this code about aliasing, but > > I'm > > > > not certain what they are. Let me go over what I believe the context for > > > this > > > > code is and make comments conditional on that. 
> > > > > > > > I believe that this function will be called in only three situations: > > > > * First initialization. main_table_ will be null, params->main_table will > > > point > > > > to memory mapped from a file, and params->extra_table will be null. > > > > > > params->extra_table should not be null. > > > > So what's the size of the extra_table on init? Just the header? > > There is no header on the extra table, but it has to have some space there to be > able to store something. The reason is that no matter how big the main table is, > it may take as little as 5 entries to require space on the extra table. Backend > unit tests start with something like 8 cells, and production code starts with > 1/8th of the main table. > > > > > > > > > > * Growth of extra table. params->main_table will be NULL. > > > params->extra_table > > > > != extra_table_, but will point to a memory mapped region of the same > file, > > > only > > > > larger. Thus any changes made to params->extra_table will be reflected in > > > > extra_table_ and vice versa (for the length that extra_table_ is mapped > > for). > > > > extra_table_ will be unmapped after this function returns (does the > calling > > > code > > > > have it's own pointer to extra_table_?) > > > > > > Yes, someone will know how to unmap the old table after this method returns. > > > > > > > * Doubling of main table. params->main_table != main_table_, but will > point > > > to > > > > a memory mapped region of the same file, only doubled in size. > > > > params->extra_table will point to a section of memory mapped to the extra > > > table > > > > file, that hasn't changed in size any. params->extra_table != > extra_table_ > > > and > > > > the location pointed to by extra_table_ will be unmapped after this > routine > > > > finishes (??), but the two memory regions will be direct aliases to one > > > another. > > > > > > sounds correct. 
> > > > > > The underlying assumption is that the index is memory mapped and follows the > > > format (and files) described in disk_format_v3.h, so the act of > > re-initializing > > > the IndexTable is a consequence of growing some file and creating another > map > > of > > > the new extension. As such, we should never receive the same pointer that we > > are > > > currently using (but there's no check for that, because it shouldn't really > > > matter), and we deal with pointers not mapped sections (in fact all the unit > > > tests just allocate memory, never map anything). > > > > > > Aliasing is important in that we don't memcpy data from the old mapping to > the > > > new mapping when the file just grows, as that would be a waste of time. So > > yes, > > > this has a memory interface, but expects the caller to deal with the files > as > > > defined by the file format, not just random pointers (hence > > > TestCacheTables::CopyFrom simulating that behavior) > > > > > > But that should be clear from the comment at the top of index_table.h, > right? > > > > Re-reading that comment, I find it more abstract that I'd like; I'd rather the > > aliasing assumptions were made explicit, either in the comment to this > function > > or in the comments to IndexTableInitData. But see later. > > I'm a little worried about this comment. The header mentions multiple files that > can be remapped while the cache is working and that management is external to > this file, so it looks pretty concrete to me. Without the files being mmapped, > the whole IndexTable would > be completely impractical so that's not exactly a detail. > > I'll extend the comment for Init() I'm good with the IndexTable code explicitly relying on the memory mapping. My preference would be that that memory mapping and the aliasing it implies was called out actually in a comment near the code, but if you'd prefer a pointer to the design doc I'm ok with that. 
Show quoted text > > > > > > > > (I'll probably want a comment clarifying these assumptions somewhere, but > > I'll > > > > wait to suggest what until I'm certain I understand it all. :-}) > > > > > > > > Assuming that that's correct, why refer to the same data through two > > different > > > > points (extra_table_ and params->extra_table)? It seems like it's just > > going > > > to > > > > generate confusion. > > > > > > Do I? I try to refer to the data that logically is being accessed... so for > > > instance line 461 copies the old data somewhere safe and line 463 clears the > > > storage for the new data. Production code could copy the new data and clear > > the > > > old data, but that would confuse the reader and prevent the unit tests from > > > working :) > > > > > > I mean, the whole thing requires mapping (or a judicious emulation from the > > > caller), but if this code is misusing source or destination somewhere > assuming > > > that they are the same, let me know so that it's fixed. > > > > > > So, conceptually, the caller could suspend all requests to this class, copy > > the > > > file to another file and map the new file (so that there is no aliasing) > > before > > > calling into this class again asking for re-init. > > > > Ah! Thank you. That's a (hypothetical) use case that's very useful for me. > > Can we put in a comment (before this function or in IndexTableInitData) that > if > > this function is being called a second time, the caller of Init guarantees > that > > the data pointed to by params->main_table will be an improper (i.e. possibly > > identical) super-set of what was previously passed, and that if main_table > isn't > > being grown, params->extra_table will similarly be a super-set of what was > > previously passed? > > > > I'd also like to have the callers responsibility to deallocate the memory that > > main_table_ and extra_table_ used to point to made explicit. 
If one isn't > > thinking about this function in terms of memory mapped files, it's somewhat > > surprising; it effectively means that the caller owns that memory, which I > don't > > think is the default assumption one would have looking at the call. > > > > The memory ownership should be clear by the definition of IndexTableInitData. > There is ownership transfer for the backups, but not for the bitmap or tables. > That's the (relatively new) coding standard so it feels a little weird if that > is repeated by comments. Ok, that makes sense. Show quoted text >
	464 }

	465 main_table_ = params->main_table;

	466 }

	467 DCHECK(main_table_);

	468 extra_table_ = params->extra_table;

	469

	470 extra_bits_ = base::bits::Log2Floor(header_->table_len) -

	471 base::bits::Log2Floor(kBaseTableLen);

	472 DCHECK_GE(extra_bits_, 0);

	473 DCHECK_LE(extra_bits_, 11);
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 What results in this 11? This seems like you're s What results in this 11? This seems like you're simply saying that you can't have a table_len greater than 32 bits, in which case don't you want to just say that? (I.e. IIRTCC the 11 is dependent on a specific value of kBaseTableLen, and that dependency should be explicit in the DCHECK). rvargas (doing something else) 2013/11/26 00:32:41 This is just saying that the table can be doubled Show quoted text On 2013/11/25 19:48:07, rdsmith wrote: > What results in this 11? This seems like you're simply saying that you can't > have a table_len greater than 32 bits, in which case don't you want to just say > that? (I.e. IIRTCC the 11 is dependent on a specific value of kBaseTableLen, > and that dependency should be explicit in the DCHECK). This is just saying that the table can be doubled 11 times from the smaller table to the biggest table. Yes, the value (11) is derived from kBaseTableLen and maybe kMaxAddress or kCellAddressMask, but I thought it was clearer to have the number of bits here (right when we are saying "how many extra bits can we have") rather to come out with a probably cryptic formula that derives the proper value from other numbers. The cost is of course less flexibility if the format change, but at least it will fail right when it is needed. When I was writing the code this value changed right here multiple times depending on what I had implemented (and was ready to test) so it seemed better to keep it that way instead of tying it to other constants. I can define a local constant here if you don't like it. Randy Smith (Not in Mondays) 2013/12/02 21:48:06 Huh. I don't particularly care about future flexi Show quoted text On 2013/11/26 00:32:41, rvargas wrote: > On 2013/11/25 19:48:07, rdsmith wrote: > > What results in this 11? 
This seems like you're simply saying that you can't > > have a table_len greater than 32 bits, in which case don't you want to just > say > > that? (I.e. IIRTCC the 11 is dependent on a specific value of kBaseTableLen, > > and that dependency should be explicit in the DCHECK). > > This is just saying that the table can be doubled 11 times from the smaller > table to the biggest table. Yes, the value (11) is derived from kBaseTableLen > and maybe kMaxAddress or kCellAddressMask, but I thought it was clearer to have > the number of bits here (right when we are saying "how many extra bits can we > have") rather to come out with a probably cryptic formula that derives the > proper value from other numbers. The cost is of course less flexibility if the > format change, but at least it will fail right when it is needed. When I was > writing the code this value changed right here multiple times depending on what > I had implemented (and was ready to test) so it seemed better to keep it that > way instead of tying it to other constants. Huh. I don't particularly care about future flexibility (I mean, future flexibility is good, just not my highest current priority :-}). But from my perspective in reading the code, having the cryptic formulas calls out to the reader what the dependencies between the different constants are. I had been thinking of asking if we could have constants defined in disk_format_v3.h for the different pieces of the IndexCell, and then define the other constants that need to match those based on those constants. It means if you're tracing the "Why is this number this value?" question, you get roadsigns. Can you say more about why having an 11 is clearer? I'm not getting it. It doesn't feel like it tells the code reader/maintainer/debugger anything. Show quoted text > I can define a local constant here if you don't like it. 
I'm not interested in "const uint32 kExtraBitsLimit = 11;", if that's what you're offering; I don't find that any more useful than just using 11. But I'd be interested in your thoughts about my general argument above. rvargas (doing something else) 2013/12/04 01:04:17 My best example would be the failure of the formul Show quoted text On 2013/12/02 21:48:06, rdsmith wrote: > On 2013/11/26 00:32:41, rvargas wrote: > > On 2013/11/25 19:48:07, rdsmith wrote: > > > What results in this 11? This seems like you're simply saying that you > can't > > > have a table_len greater than 32 bits, in which case don't you want to just > > say > > > that? (I.e. IIRTCC the 11 is dependent on a specific value of > kBaseTableLen, > > > and that dependency should be explicit in the DCHECK). > > > > This is just saying that the table can be doubled 11 times from the smaller > > table to the biggest table. Yes, the value (11) is derived from kBaseTableLen > > and maybe kMaxAddress or kCellAddressMask, but I thought it was clearer to > have > > the number of bits here (right when we are saying "how many extra bits can we > > have") rather to come out with a probably cryptic formula that derives the > > proper value from other numbers. The cost is of course less flexibility if the > > format change, but at least it will fail right when it is needed. When I was > > writing the code this value changed right here multiple times depending on > what > > I had implemented (and was ready to test) so it seemed better to keep it that > > way instead of tying it to other constants. > > Huh. I don't particularly care about future flexibility (I mean, future > flexibility is good, just not my highest current priority :-}). But from my > perspective in reading the code, having the cryptic formulas calls out to the > reader what the dependencies between the different constants are. 
I had been > thinking of asking if we could have constants defined in disk_format_v3.h for > the different pieces of the IndexCell, and then define the other constants that > need to match those based on those constants. It means if you're tracing the > "Why is this number this value?" question, you get roadsigns. > > Can you say more about why having an 11 is clearer? I'm not getting it. It > doesn't feel like it tells the code reader/maintainer/debugger anything. > > > I can define a local constant here if you don't like it. > > I'm not interested in "const uint32 kExtraBitsLimit = 11;", if that's what > you're offering; I don't find that any more useful than just using 11. But I'd > be interested in your thoughts about my general argument above. > > My best example would be the failure of the formula two lines below this one. I can come up with something like the log of the largest table divided by the smallest table, but then that raises the issue of deriving the largest table. I hear what you are saying and I generally agree, to the point of having a bunch of constants with meaningful names whose values can be derived easily just by inspection (the reason for the constants here). In this particular case, we have a line that calculates a number from the comparison of the current table versus the smallest table. The first check says we cannot have a negative number (a table smaller than the smallest), the second line checks the value against an upper bound, and that should tell the reader right away that we are checking for the largest possible table (right?) The only question is then why the largest table is 2^11 times larger than the smallest table (11 doublings), or in other words, what is the size of the largest table. Honestly, if I'm reading some code that I didn't write, I would not care much about the value of that number (the actual limit for the test): I would mostly care for some limit being there, either on a check, or in the code that uses that number.
In this case, it would mean to me the same if that value is 8 or 15, or derived from a formula. 8 could be wrong, the formula could be off by one and in any case I would not spend time looking at the value unless I'm actually trying to debug a problem and I have reason to suspect that the max value could be at fault. And if I'm doing that, I would probably ignore the formula, figure out what the value should be and compare that against what the code says (number of bits in an address = 22, so 21 of them go to the main table; smallest table = 1024 (10 bits), so delta = 21 - 10). And that's why I prefer a plain number here rather than a formula... given that too much may be read from the formula (as in the check against 6). What I think matters is not the value, but the check itself (double check an upper limit, whatever that is). I hope I don't sound combative, that's not my intention. I just want to answer the question before we settle on what to do. I'm up for moving relevant constants to the file format header. That's why kBaseTableLen is there. Shifts and masks I'm not so sure as they make more sense with the code that uses them, and I don't want to make them visible to any code outside this file. Randy Smith (Not in Mondays) 2013/12/05 19:17:48 Nope, you don't sound combative, and I appreciate Show quoted text On 2013/12/04 01:04:17, rvargas wrote: > On 2013/12/02 21:48:06, rdsmith wrote: > > On 2013/11/26 00:32:41, rvargas wrote: > > > On 2013/11/25 19:48:07, rdsmith wrote: > > > > What results in this 11? This seems like you're simply saying that you > > can't > > > > have a table_len greater than 32 bits, in which case don't you want to > just > > > say > > > > that? (I.e. IIRTCC the 11 is dependent on a specific value of > > kBaseTableLen, > > > > and that dependency should be explicit in the DCHECK). > > > > > > This is just saying that the table can be doubled 11 times from the smaller > > > table to the biggest table.
Yes, the value (11) is derived from > kBaseTableLen > > > and maybe kMaxAddress or kCellAddressMask, but I thought it was clearer to > > have > > > the number of bits here (right when we are saying "how many extra bits can > we > > > have") rather to come out with a probably cryptic formula that derives the > > > proper value from other numbers. The cost is of course less flexibility if > the > > > format change, but at least it will fail right when it is needed. When I was > > > writing the code this value changed right here multiple times depending on > > what > > > I had implemented (and was ready to test) so it seemed better to keep it > that > > > way instead of tying it to other constants. > > > > Huh. I don't particularly care about future flexibility (I mean, future > > flexibility is good, just not my highest current priority :-}). But from my > > perspective in reading the code, having the cryptic formulas calls out to the > > reader what the dependencies between the different constants are. I had been > > thinking of asking if we could have constants defined in disk_format_v3.h for > > the different pieces of the IndexCell, and then define the other constants > that > > need to match those based on those constants. It means if you're tracing the > > "Why is this number this value?" question, you get roadsigns. > > > > Can you say more about why having an 11 is clearer? I'm not getting it. It > > doesn't feel like it tells the code reader/maintainer/debugger anything. > > > > > I can define a local constant here if you don't like it. > > > > I'm not interested in "const uint32 kExtraBitsLimit = 11;", if that's what > > you're offering; I don't find that any more useful than just using 11. But > I'd > > be interested in your thoughts about my general argument above. > > > > > > My best example would be the failure of the formula two lines below this one. 
> > I can come up with something like the log of the largest table divided by the > smallest table, but then that rises the issue of deriving the largest table. I > hear what you are saying and I generally agree, to the point of having a bunch > of constants with meaningful names whose values can be derived easily by just > inspection (the reason for the constants here). > > In this particular case, we have a line that calculates a number from the > comparison of the current table versus the smallest table. The first check says > we cannot have a negative number (a table smaller than the smallest), the second > line checks the value against an upper bound, and that should tell right away > that we are checking for the largest possible table (right?) > > The only question is then why the largest table is 11 times larger than the > smallest table, or in other words, what is the size of the largest table. > Honestly, if I'm reading some code that I didn't write, I would not care much > about the value of that number (the actual limit for the test): I would mostly > care for some limit being there, either on a check, or in the code that uses > that number. In this case, it would mean to me the same if that value is 8 or > 15, or derived from a formula. 8 could be wrong, the formula could be off by one > and in any case I would not spend time looking at the value unless I'm actually > trying to debug a problem and I have reason to suspect that the max value could > be at fault. And if I'm doing that, I would probably ignore the formula, figure > out what the value should be and compare that against what the code says (number > of bits on an address = 22, so 21 of them go to the main table; smaller table = > 1024 (10 bits), so delta = 21 - 10) > > And that's why I prefer a plain number here rather than a formula... given that > too much may be read from the formula (as in the check against 6). 
What I think > it matters is not the value, but the check itself (double check an upper limit, > whatever that is) > > I hope I don't sound combative, that's not my intention. I just want to answer > the question before we setup in what to do. Nope, you don't sound combative, and I appreciate the detailed explanation. Show quoted text > > I'm up for moving relevant constants to the file format header. That's why > kBaseTableLen is there. Shifts and masks I'm not so sure as they make more sense > with the code that uses them, and I don't want to make them visible to any code > outside this file. I think most of my feeling on this topic would be addressed by making explicit the relationships between the different constants--I don't have strong general feelings about either where the constants are declared or this particular location being a number rather than a constant 11. But I would like constants that are used in multiple places in the code and do have a relationship to each other to be expressed in terms of each other. So for instance, disk_format_v3.h would have k{SmallTable,}HashOffset and k{SmallTable,}HashLength, and kHash{SmallTable,}Shift and kCell{SmallTable,}HashMask would be defined from those values (probably in index_table.cc). Would you be good with that? Having said all that, the place I diverge from you in modeling this code being read by someone who doesn't know it is, if they suspected the max value would be at fault, the formula would give them a hint as to how that max was derived. That hint might be wrong, but it's structure that would lead them further into the code, and point at the logic that they would need to vet. But as I say, I more care about the relationship between the defined constants being clear than about this particular number being a formula. Randy Smith (Not in Mondays) 2013/12/26 21:45:49 Any thoughts on this request? 
An extreme example Show quoted text On 2013/12/05 19:17:48, rdsmith wrote: > On 2013/12/04 01:04:17, rvargas wrote: > > On 2013/12/02 21:48:06, rdsmith wrote: > > > On 2013/11/26 00:32:41, rvargas wrote: > > > > On 2013/11/25 19:48:07, rdsmith wrote: > > > > > What results in this 11? This seems like you're simply saying that you > > > can't > > > > > have a table_len greater than 32 bits, in which case don't you want to > > just > > > > say > > > > > that? (I.e. IIRTCC the 11 is dependent on a specific value of > > > kBaseTableLen, > > > > > and that dependency should be explicit in the DCHECK). > > > > > > > > This is just saying that the table can be doubled 11 times from the > smaller > > > > table to the biggest table. Yes, the value (11) is derived from > > kBaseTableLen > > > > and maybe kMaxAddress or kCellAddressMask, but I thought it was clearer to > > > have > > > > the number of bits here (right when we are saying "how many extra bits can > > we > > > > have") rather to come out with a probably cryptic formula that derives the > > > > proper value from other numbers. The cost is of course less flexibility if > > the > > > > format change, but at least it will fail right when it is needed. When I > was > > > > writing the code this value changed right here multiple times depending on > > > what > > > > I had implemented (and was ready to test) so it seemed better to keep it > > that > > > > way instead of tying it to other constants. > > > > > > Huh. I don't particularly care about future flexibility (I mean, future > > > flexibility is good, just not my highest current priority :-}). But from my > > > perspective in reading the code, having the cryptic formulas calls out to > the > > > reader what the dependencies between the different constants are. 
I had > been > > > thinking of asking if we could have constants defined in disk_format_v3.h > for > > > the different pieces of the IndexCell, and then define the other constants > > that > > > need to match those based on those constants. It means if you're tracing > the > > > "Why is this number this value?" question, you get roadsigns. > > > > > > Can you say more about why having an 11 is clearer? I'm not getting it. It > > > doesn't feel like it tells the code reader/maintainer/debugger anything. > > > > > > > I can define a local constant here if you don't like it. > > > > > > I'm not interested in "const uint32 kExtraBitsLimit = 11;", if that's what > > > you're offering; I don't find that any more useful than just using 11. But > > I'd > > > be interested in your thoughts about my general argument above. > > > > > > > > > > My best example would be the failure of the formula two lines below this one. > > > > I can come up with something like the log of the largest table divided by the > > smallest table, but then that rises the issue of deriving the largest table. I > > hear what you are saying and I generally agree, to the point of having a bunch > > of constants with meaningful names whose values can be derived easily by just > > inspection (the reason for the constants here). > > > > In this particular case, we have a line that calculates a number from the > > comparison of the current table versus the smallest table. The first check > says > > we cannot have a negative number (a table smaller than the smallest), the > second > > line checks the value against an upper bound, and that should tell right away > > that we are checking for the largest possible table (right?) > > > > The only question is then why the largest table is 11 times larger than the > > smallest table, or in other words, what is the size of the largest table. 
> > Honestly, if I'm reading some code that I didn't write, I would not care much > > about the value of that number (the actual limit for the test): I would mostly > > care for some limit being there, either on a check, or in the code that uses > > that number. In this case, it would mean to me the same if that value is 8 or > > 15, or derived from a formula. 8 could be wrong, the formula could be off by > one > > and in any case I would not spend time looking at the value unless I'm > actually > > trying to debug a problem and I have reason to suspect that the max value > could > > be at fault. And if I'm doing that, I would probably ignore the formula, > figure > > out what the value should be and compare that against what the code says > (number > > of bits on an address = 22, so 21 of them go to the main table; smaller table > = > > 1024 (10 bits), so delta = 21 - 10) > > > > And that's why I prefer a plain number here rather than a formula... given > that > > too much may be read from the formula (as in the check against 6). What I > think > > it matters is not the value, but the check itself (double check an upper > limit, > > whatever that is) > > > > I hope I don't sound combative, that's not my intention. I just want to answer > > the question before we setup in what to do. > > Nope, you don't sound combative, and I appreciate the detailed explanation. > > > > > I'm up for moving relevant constants to the file format header. That's why > > kBaseTableLen is there. Shifts and masks I'm not so sure as they make more > sense > > with the code that uses them, and I don't want to make them visible to any > code > > outside this file. > > I think most of my feeling on this topic would be addressed by making explicit > the relationships between the different constants--I don't have strong general > feelings about either where the constants are declared or this particular > location being a number rather than a constant 11. 
But I would like constants > that are used in multiple places in the code and do have a relationship to each > other to be expressed in terms of each other. So for instance, disk_format_v3.h > would have k{SmallTable,}HashOffset and k{SmallTable,}HashLength, and > kHash{SmallTable,}Shift and kCell{SmallTable,}HashMask would be defined from > those values (probably in index_table.cc). Would you be good with that? Any thoughts on this request? An extreme example that I don't think I've previously mentioned: kBaseTableLen is defined in three different places (backend_impl.cc, backend_impl_v3.cc, and backend_worker.cc, disk_format_v3.h), and the definition in the .h file looks to contradict the common definition in the .cc files. Show quoted text > > Having said all that, the place I diverge from you in modeling this code being > read by someone who doesn't know it is, if they suspected the max value would be > at fault, the formula would give them a hint as to how that max was derived. > That hint might be wrong, but it's structure that would lead them further into > the code, and point at the logic that they would need to vet. But as I say, I > more care about the relationship between the defined constants being clear than > about this particular number being a formula. rvargas (doing something else) 2013/12/27 19:31:46 I tried to move the conversation to the latest ver Show quoted text > > I think most of my feeling on this topic would be addressed by making explicit > > the relationships between the different constants--I don't have strong general > > feelings about either where the constants are declared or this particular > > location being a number rather than a constant 11. But I would like constants > > that are used in multiple places in the code and do have a relationship to > each > > other to be expressed in terms of each other. 
So for instance, > disk_format_v3.h > > would have k{SmallTable,}HashOffset and k{SmallTable,}HashLength, and > > kHash{SmallTable,}Shift and kCell{SmallTable,}HashMask would be defined from > > those values (probably in index_table.cc). Would you be good with that? > > Any thoughts on this request? I tried to move the conversation to the latest version of the code, where I added comments about it. Show quoted text > An extreme example that I don't think I've previously mentioned: kBaseTableLen > is defined in three different places (backend_impl.cc, backend_impl_v3.cc, and > backend_worker.cc, disk_format_v3.h), and the definition in the .h file looks to > contradict the common definition in the .cc files. That is just a transient detail... - backend_impl.cc: nothing to do about it (old format) - backend_impl_v3.cc: old, unused code - backend_worker.cc: either old, unused code, or using the value from disk_format_v3.h - disk_format_v3.h: definition for the new format.
	474 mask_ = ((kBaseTableLen / kCellsPerBucket) << extra_bits_) - 1;

	475 small_table_ = extra_bits_ < kHashShift - kHashSmallTableShift;
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 In other words, while the number of bits that are In other words, while the number of bits that are thrown away before making the comparison between the hash and the IndexCell partial hash is greater than the number of bits used to index into the table, leave the table small, but when we start using all those thrown-away bits and crowding into the comparison section, switch to the large table? (I'll note that this seems like a different metric for the small/large table distinction that I'd previously understood--I thought it was about whether or not the entries will always be stored in one file. But looking at disk_format_v3.h, it seems like the distinction is specified purely by the number of entries. If there's a subtle background connection between these three things, it should probably be made explicit, or if one of them isn't relevant, that should be made clear too. Based on how much pain it took me to figure out, I'll probably also want an explanatory comment about what hashes mean (in the different contexts) somewhere, but I'll ask for that once I'm confident I actually understand how they're used.) rvargas (doing something else) 2013/11/26 00:32:41 I didn't intend to make a strong statement about w Show quoted text On 2013/11/25 19:48:07, rdsmith wrote: > In other words, while the number of bits that are thrown away before making the > comparison between the hash and the IndexCell partial hash is greater than the > number of bits used to index into the table, leave the table small, but when we > start using all those thrown-away bits and crowding into the comparison section, > switch to the large table? I didn't intend to make a strong statement about what this means. It really just means extra_bits_ < 6, as in the table can be doubled 6 times before reaching the 64k border between the two formats. 
It's just that those two values are tightly related to each other, and the only difference between them is the table type (small or large), and subtracting them gives me the 6 that I need. Somewhat similar to the previous '11' but this time I'm tying it to other constants that change accordingly (and as far as I remember I use this same formula in other places). Show quoted text > > (I'll note that this seems like a different metric for the small/large table > distinction that I'd previously understood--I thought it was about whether or > not the entries will always be stored in one file. But looking at > disk_format_v3.h, it seems like the distinction is specified purely by the > number of entries. If there's a subtle background connection between these > three things, it should probably be made explicit, or if one of them isn't > relevant, that should be made clear too. The distinction between small and large is basically arbitrary. 64K is an interesting number, and it is the base table size of V2, so it was just natural to keep using it. A small table is a new feature of V3, so it covers tables smaller than 64K. The maximum number of blocks on a block file also happens to be 64k mostly because 64k is an interesting number (and it doesn't yield files that are too big with the largest block file), and it requires two standard pages (4k each) to hold the bitmap. That results in being able to store up to 64k entries on a single file (in fact, slightly less than that). And that just fits in that it allows the implied file feature of small tables, but it is not the cause of the border (although it would be hard to move that border somewhere else). So things fit, but the "real" distinction between small and large tables is arbitrarily a 64k table. 
That's why some numbers are really "count how many times we can do foo", given the arbitrary limits of the format (start at 1k, switch at 64k, up to 22 bits) Show quoted text > > Based on how much pain it took me to figure out, I'll probably also want an > explanatory comment about what hashes mean (in the different contexts) > somewhere, but I'll ask for that once I'm confident I actually understand how > they're used.) Randy Smith (Not in Mondays) 2013/12/02 21:48:06 This makes sense, but mostly comes down to my want Show quoted text On 2013/11/26 00:32:41, rvargas wrote: > On 2013/11/25 19:48:07, rdsmith wrote: > > In other words, while the number of bits that are thrown away before making > the > > comparison between the hash and the IndexCell partial hash is greater than the > > number of bits used to index into the table, leave the table small, but when > we > > start using all those thrown-away bits and crowding into the comparison > section, > > switch to the large table? > > I didn't intend to make a strong statement about what this means. It really just > means extra_bits_ < 6, as in the table can be doubled 6 times before reaching > the 64k border between the two formats. > > It's just that those two values are tightly related to each other, and the only > difference between them is the table type (small or large), and subtracting them > gives me the 6 that I need. Somewhat similar to the previous '11' but this time > I'm tying it to other constants that change accordingly (and as far as I > remember I use this same formula in other places). > > > > > > (I'll note that this seems like a different metric for the small/large table > > distinction that I'd previously understood--I thought it was about whether or > > not the entries will always be stored in one file. But looking at > > disk_format_v3.h, it seems like the distinction is specified purely by the > > number of entries. 
If there's a subtle background connection between these > > three things, it should probably be made explicit, or if one of them isn't > > relevant, that should be made clear too. > > The distinction between small and large is basically arbitrary. 64K is an > interesting number, and it is the base table size of V2, so it was just natural > to keep using it. A small table is a new feature of V3, so it covers tables > smaller than 64K. > > The maximum number of blocks on a block file also happens to be 64k mostly > because 64k is an interesting number (and it doesn't yield files that are too > big with the largest block file), and it requires two standard pages (4k each) > to hold the bitmap. > > That results in being able to store up to 64k entries on a single file (in fact, > slightly less than that). And that just fits in that it allows the implied file > feature of small tables, but it is not the cause of the border (although it > would be hard to move that border somewhere else). > > So things fit, but the "real" distinction between small and large tables is > arbitrarily a 64k table. That's why some numbers are really "count how many > times we can do foo", given the arbitrary limits of the format (start at 1k, > switch at 64k, up to 22 bits) > > > > > Based on how much pain it took me to figure out, I'll probably also want an > > explanatory comment about what hashes mean (in the different contexts) > > somewhere, but I'll ask for that once I'm confident I actually understand how > > they're used.) > This makes sense, but mostly comes down to my wanting to be able to trace dependencies between constants/distinctions. I.e. if the defining difference between a small and a large table is the 64K boundary, I'd love to have (e.g.) a constant somewhere kMaxSmallTableSize = 64K, and have (e.g.) the difference between kHashShift and kHashSmallTable shift defined in terms of it. (I.e. this is the same comment as the previous.)
	476 if (!small_table_)

	477 extra_bits_ -= kHashShift - kHashSmallTableShift;

	478

	479 // table_len keeps the max number of cells stored by the index. We need a

	480 // bitmap with 1 bit per cell, and that bitmap has num_words 32-bit words.

	481 int num_words = (header_->table_len + 31) / 32;

	482

	483 if (old_extra_table) {

	484 // All the cells from the extra table are moving to the new tables so before

	485 // creating the bitmaps, clear the part of the extra table.
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 nit, suggestion: "clear the part of the bitmap referring to the extra table"? Randy Smith (Not in Mondays) 2013/12/02 21:48:06 On 2013/11/25 19:48:07, rdsmith wrote: > nit, suggestion: "clear the part of the bitmap referring to the extra table"? Done.
	486 int main_table_bit_words = ((mask_ >> 1) + 1) * kCellsPerBucket / 32;

	487 DCHECK_GT(num_words, main_table_bit_words);

	488 memset(params->index_bitmap->bitmap + main_table_bit_words, 0,

	489 (num_words - main_table_bit_words) * sizeof(int32));

	490

	491 DCHECK(growing);

	492 int old_num_words = (backup_header_.get()->table_len + 31) / 32;

	493 DCHECK_GT(old_num_words, main_table_bit_words);

	494 memset(backup_bitmap_storage_.get() + main_table_bit_words, 0,

	495 (old_num_words - main_table_bit_words) * sizeof(int32));

	496 }

	497 bitmap_.reset(new Bitmap(params->index_bitmap->bitmap, header_->table_len,

	498 num_words));

	499

	500 if (growing) {

	501 int old_num_words = (backup_header_.get()->table_len + 31) / 32;

	502 DCHECK_GE(num_words, old_num_words);
	Randy Smith (Not in Mondays) 2013/11/25 19:48:07 Under what circumstances will they be equal? Does that imply that we could be doubling table size in a situation where the extra_table is the same size as the added main table size? rvargas (doing something else) 2013/11/26 00:32:41 On 2013/11/25 19:48:07, rdsmith wrote: > Under what circumstances will they be equal? Does that imply that we could be > doubling table size in a situation where the extra_table is the same size as the > added main table size? Mostly a check for not getting a negative index below. We could be doing that, or maybe growing the extra table by 16 entries (so that the number of words doesn't change). Backend tests grow the table really slowly :)
	503 scoped_ptr<uint32[]> storage(new uint32[num_words]);

	504 memcpy(storage.get(), backup_bitmap_storage_.get(),

	505 old_num_words * sizeof(int32));

	506 memset(storage.get() + old_num_words, 0,

	507 (num_words - old_num_words) * sizeof(int32));

	508

	509 backup_bitmap_storage_.swap(storage);

	510 backup_header_->table_len = header_->table_len;

	511 } else {

	512 backup_bitmap_storage_.reset(params->backup_bitmap.release());

	513 backup_header_.reset(params->backup_header.release());

	514 }

	515

	516 num_words = (backup_header_->table_len + 31) / 32;

	517 backup_bitmap_.reset(new Bitmap(backup_bitmap_storage_.get(),

	518 backup_header_->table_len, num_words));

	519 if (old_extra_table)

	520 MoveCells(old_extra_table.get());

	521

	522 if (small_table_)

	523 DCHECK(header_->flags & SMALL_CACHE);

	524 }

	525

	526 void IndexTable::Reset() {

	527 header_ = NULL;

	528 main_table_ = NULL;

	529 extra_table_ = NULL;

	530 bitmap_.reset();

	531 backup_bitmap_.reset();

	532 backup_header_.reset();

	533 backup_bitmap_storage_.reset();

	534 modified_ = false;

	535 }

	536

	537 // The general method for locating cells is to:

	538 // 1. Get the first bucket. This usually means directly indexing the table (as

	539 // this method does), or iterating through all possible buckets.

	540 // 2. Iterate through all the cells in that first bucket.

	541 // 3. If there is a linked bucket, locate it directly in the extra table.

	542 // 4. Go back to 2, as needed.

	543 //

	544 // One consequence of this pattern is that we never start looking at buckets in

	545 // the extra table, unless we are following a link from the main table.

	546 EntrySet IndexTable::LookupEntries(uint32 hash) {

	547 EntrySet entries;

	548 int bucket_id = static_cast<int>(hash & mask_);

	549 IndexBucket* bucket = &main_table_[bucket_id];

	550 for (;;) {

	551 for (int i = 0; i < kCellsPerBucket; i++) {

	552 IndexCell* current_cell = &bucket->cells[i];

	553 if (!GetAddressValue(*current_cell))

	554 continue;

	555 if (!SanityCheck(*current_cell)) {

	556 NOTREACHED();

	557 int cell_id = bucket_id * kCellsPerBucket + i;

	558 current_cell->Clear();

	559 bitmap_->Set(cell_id, false);

	560 backup_bitmap_->Set(cell_id, false);

	561 modified_ = true;

	562 continue;

	563 }

	564 int cell_id = bucket_id * kCellsPerBucket + i;

	565 if (MisplacedHash(*current_cell, hash)) {

	566 HandleMisplacedCell(current_cell, cell_id, hash & mask_);

	567 } else if (IsHashMatch(*current_cell, hash)) {

	568 EntryCell entry_cell(cell_id, hash, *current_cell, small_table_);

	569 CheckState(entry_cell);

	570 if (entry_cell.GetState() != ENTRY_DELETED) {

	571 entries.cells.push_back(entry_cell);

	572 if (entry_cell.GetGroup() == ENTRY_EVICTED)

	573 entries.evicted_count++;

	574 }

	575 }

	576 }

	577 bucket_id = GetNextBucket(mask_ + 1, header()->max_bucket, extra_table_,

	578 &bucket);

	579 if (!bucket_id)

	580 break;

	581 }

	582 return entries;

	583 }

	584

	585 EntryCell IndexTable::CreateEntryCell(uint32 hash, Addr address) {

	586 DCHECK(IsValidAddress(address));

	587 DCHECK(address.ToIndexEntryAddress());

	588

	589 int bucket_id = static_cast<int>(hash & mask_);

	590 int cell_id = 0;

	591 IndexBucket* bucket = &main_table_[bucket_id];

	592 IndexCell* current_cell = NULL;

	593 bool found = false;

	594 for (; !found;) {

	595 for (int i = 0; i < kCellsPerBucket && !found; i++) {

	596 current_cell = &bucket->cells[i];

	597 if (!GetAddressValue(*current_cell)) {

	598 cell_id = bucket_id * kCellsPerBucket + i;

	599 found = true;

	600 }

	601 }

	602 if (found)

	603 break;

	604 bucket_id = GetNextBucket(mask_ + 1, header()->max_bucket, extra_table_,

	605 &bucket);

	606 if (!bucket_id)

	607 break;

	608 }

	609

	610 if (!found) {

	611 bucket_id = NewExtraBucket();

	612 if (bucket_id) {

	613 cell_id = bucket_id * kCellsPerBucket;

	614 bucket->next = cell_id;

	615 bucket = &extra_table_[bucket_id - (mask_ + 1)];

	616 bucket->hash = hash & mask_;

	617 found = true;

	618 } else {

	619 // address 0 is a reserved value, and the caller interprets it as invalid.

	620 address.set_value(0);

	621 }

	622 }

	623

	624 EntryCell entry_cell(cell_id, hash, address, small_table_);

	625 if (address.file_type() == BLOCK_EVICTED)

	626 entry_cell.SetGroup(ENTRY_EVICTED);

	627 else

	628 entry_cell.SetGroup(ENTRY_NO_USE);

	629 Save(&entry_cell);

	630

	631 if (found) {

	632 bitmap_->Set(cell_id, true);

	633 backup_bitmap_->Set(cell_id, true);

	634 header()->used_cells++;

	635 modified_ = true;

	636 }

	637

	638 return entry_cell;

	639 }

	640

	641 EntryCell IndexTable::FindEntryCell(uint32 hash, Addr address) {

	642 return FindEntryCellImpl(hash, address, false);

	643 }

	644

	645 int IndexTable::CalculateTimestamp(Time time) {

	646 TimeDelta delta = time - Time::FromInternalValue(header_->base_time);

	647 return std::max(delta.InMinutes(), 0);

	648 }

	649

	650 base::Time IndexTable::TimeFromTimestamp(int timestamp) {

	651 return Time::FromInternalValue(header_->base_time) +

	652 TimeDelta::FromMinutes(timestamp);

	653 }

	654

	655 void IndexTable::SetSate(uint32 hash, Addr address, EntryState state) {

	656 EntryCell cell = FindEntryCellImpl(hash, address, state == ENTRY_FREE);

	657 if (!cell.IsValid()) {

	658 NOTREACHED();

	659 return;

	660 }

	661

	662 EntryState old_state = cell.GetState();

	663 if (state == ENTRY_FREE) {

	664 DCHECK_EQ(old_state, ENTRY_DELETED);

	665 } else if (state == ENTRY_NEW) {

	666 DCHECK_EQ(old_state, ENTRY_FREE);

	667 } else if (state == ENTRY_OPEN) {

	668 DCHECK_EQ(old_state, ENTRY_USED);

	669 } else if (state == ENTRY_MODIFIED) {

	670 DCHECK_EQ(old_state, ENTRY_OPEN);

	671 } else if (state == ENTRY_DELETED) {

	672 DCHECK(old_state == ENTRY_NEW \|\| old_state == ENTRY_OPEN \|\|

	673 old_state == ENTRY_MODIFIED);

	674 } else if (state == ENTRY_USED) {

	675 DCHECK(old_state == ENTRY_NEW \|\| old_state == ENTRY_OPEN \|\|

	676 old_state == ENTRY_MODIFIED);

	677 }

	678

	679 modified_ = true;

	680 if (state == ENTRY_DELETED) {

	681 bitmap_->Set(cell.cell_id(), false);

	682 backup_bitmap_->Set(cell.cell_id(), false);

	683 } else if (state == ENTRY_FREE) {

	684 cell.Clear();

	685 Write(cell);

	686 header()->used_cells--;

	687 return;

	688 }

	689 cell.SetState(state);

	690

	691 Save(&cell);

	692 }

	693

	694 void IndexTable::UpdateTime(uint32 hash, Addr address, base::Time current) {

	695 EntryCell cell = FindEntryCell(hash, address);

	696 if (!cell.IsValid())

	697 return;

	698

	699 int minutes = CalculateTimestamp(current);

	700

	701 // Keep about 3 months of headroom.

	702 const int kMaxTimestamp = (1 << 20) - 60 * 24 * 90;

	703 if (minutes > kMaxTimestamp) {

	704 // TODO(rvargas):

	705 // Update header->old_time and trigger a timer

	706 // Rebaseline timestamps and don't update sums

	707 // Start a timer (about 2 backups)

	708 // fix all ckecksums and trigger another timer

	709 // update header->old_time because rebaseline is done.

	710 minutes = std::min(minutes, (1 << 20) - 1);

	711 }

	712

	713 cell.SetTimestamp(minutes);

	714 Save(&cell);

	715 }

	716

	717 void IndexTable::Save(EntryCell* cell) {

	718 cell->FixSum();

	719 Write(*cell);

	720 }

	721

	722 void IndexTable::GetOldest(IndexIterator* no_use,

	723 IndexIterator* low_use,

	724 IndexIterator* high_use) {

	725 no_use->forward = true;

	726 low_use->forward = true;

	727 high_use->forward = true;

	728 InitIterator(no_use);

	729 InitIterator(low_use);

	730 InitIterator(high_use);

	731

	732 WalkTables(-1, no_use, low_use, high_use);

	733 }

	734

	735 bool IndexTable::GetNextCells(IndexIterator* iterator) {

	736 int current_time = iterator->timestamp;

	737 InitIterator(iterator);

	738

	739 WalkTables(current_time, iterator, iterator, iterator);

	740 return !iterator->cells.empty();

	741 }

	742

	743 void IndexTable::OnBackupTimer() {

	744 if (!modified_)

	745 return;

	746

	747 int num_words = (header_->table_len + 31) / 32;

	748 int num_bytes = num_words * 4 + static_cast<int>(sizeof(*header_));

	749 scoped_refptr<net::IOBuffer> buffer(new net::IOBuffer(num_bytes));

	750 memcpy(buffer->data(), header_, sizeof(*header_));

	751 memcpy(buffer->data() + sizeof(*header_), backup_bitmap_storage_.get(),

	752 num_words * 4);

	753 backend_->SaveIndex(buffer, num_bytes);

	754 modified_ = false;

	755 }

	756

	757 // -----------------------------------------------------------------------

	758

	759 EntryCell IndexTable::FindEntryCellImpl(uint32 hash, Addr address,

	760 bool allow_deleted) {

	761 int bucket_id = static_cast<int>(hash & mask_);

	762 IndexBucket* bucket = &main_table_[bucket_id];

	763 for (;;) {

	764 for (int i = 0; i < kCellsPerBucket; i++) {

	765 IndexCell* current_cell = &bucket->cells[i];

	766 if (!GetAddressValue(*current_cell))

	767 continue;

	768 DCHECK(SanityCheck(*current_cell));

	769 if (IsHashMatch(*current_cell, hash)) {

	770 // We have a match.

	771 int cell_id = bucket_id * kCellsPerBucket + i;

	772 EntryCell entry_cell(cell_id, hash, *current_cell, small_table_);

	773 if (entry_cell.GetAddress() != address)

	774 continue;

	775

	776 if (!allow_deleted && entry_cell.GetState() == ENTRY_DELETED)

	777 continue;

	778

	779 return entry_cell;

	780 }

	781 }

	782 bucket_id = GetNextBucket(mask_ + 1, header()->max_bucket, extra_table_,

	783 &bucket);

	784 if (!bucket_id)

	785 break;

	786 }

	787 return EntryCell();

	788 }

	789

	790 void IndexTable::CheckState(const EntryCell& cell) {

	791 int current_state = cell.GetState();

	792 if (current_state != ENTRY_FIXING) {

	793 bool present = ((current_state & 3) != 0); // Look at the last two bits.

	794 if (present != bitmap_->Get(cell.cell_id()) \|\|

	795 present != backup_bitmap_->Get(cell.cell_id())) {

	796 // There's a mismatch.

	797 if (current_state == ENTRY_DELETED) {

	798 // We were in the process of deleting this entry. Finish now.

	799 backend_->DeleteCell(cell);

	800 } else {

	801 current_state = ENTRY_FIXING;

	802 EntryCell bad_cell(cell);

	803 bad_cell.SetState(ENTRY_FIXING);

	804 Save(&bad_cell);

	805 }

	806 }

	807 }

	808

	809 if (current_state == ENTRY_FIXING)

	810 backend_->FixCell(cell);

	811 }

	812

	813 void IndexTable::Write(const EntryCell& cell) {

	814 IndexBucket* bucket = NULL;

	815 int bucket_id = cell.cell_id() / kCellsPerBucket;

	816 if (bucket_id < static_cast<int32>(mask_ + 1)) {

	817 bucket = &main_table_[bucket_id];

	818 } else {

	819 DCHECK_LE(bucket_id, header()->max_bucket);

	820 bucket = &extra_table_[bucket_id - (mask_ + 1)];

	821 }

	822

	823 int cell_number = cell.cell_id() % kCellsPerBucket;

	824 if (GetAddressValue(bucket->cells[cell_number]) && cell.GetAddressValue()) {

	825 DCHECK_EQ(cell.GetAddressValue(),

	826 GetAddressValue(bucket->cells[cell_number]));

	827 }

	828 cell.Serialize(&bucket->cells[cell_number]);

	829 }

	830

	831 int IndexTable::NewExtraBucket() {

	832 int safe_window = (header()->table_len < kNumExtraBlocks * 2) ?

	833 kNumExtraBlocks / 4 : kNumExtraBlocks;

	834 if (header()->table_len - header()->max_bucket * kCellsPerBucket <

	835 safe_window) {

	836 backend_->GrowIndex();

	837 }

	838

	839 if (header()->max_bucket * kCellsPerBucket ==

	840 header()->table_len - kCellsPerBucket) {

	841 return 0;

	842 }

	843

	844 header()->max_bucket++;

	845 return header()->max_bucket;

	846 }

	847

	848 void IndexTable::WalkTables(int limit_time,

	849 IndexIterator* no_use,

	850 IndexIterator* low_use,

	851 IndexIterator* high_use) {

	852 header_->num_no_use_entries = 0;

	853 header_->num_low_use_entries = 0;

	854 header_->num_high_use_entries = 0;

	855 header_->num_evicted_entries = 0;

	856

	857 for (int i = 0; i < static_cast<int32>(mask_ + 1); i++) {

	858 int bucket_id = i;

	859 IndexBucket* bucket = &main_table_[i];

	860 for (;;) {

	861 UpdateFromBucket(bucket, i, limit_time, no_use, low_use, high_use);

	862

	863 bucket_id = GetNextBucket(mask_ + 1, header()->max_bucket, extra_table_,

	864 &bucket);

	865 if (!bucket_id)

	866 break;

	867 }

	868 }

	869 header_->num_entries = header_->num_no_use_entries +

	870 header_->num_low_use_entries +

	871 header_->num_high_use_entries +

	872 header_->num_evicted_entries;

	873 modified_ = true;

	874 }

	875

	876 void IndexTable::UpdateFromBucket(IndexBucket* bucket, int bucket_hash,

	877 int limit_time,

	878 IndexIterator* no_use,

	879 IndexIterator* low_use,

	880 IndexIterator* high_use) {

	881 for (int i = 0; i < kCellsPerBucket; i++) {

	882 IndexCell& current_cell = bucket->cells[i];

	883 if (!GetAddressValue(current_cell))

	884 continue;

	885 DCHECK(SanityCheck(current_cell));

	886 if (!IsNormalState(current_cell))

	887 continue;

	888

	889 EntryCell entry_cell(0, GetFullHash(current_cell, bucket_hash),

	890 current_cell, small_table_);

	891 switch (GetCellGroup(current_cell)) {

	892 case ENTRY_NO_USE:

	893 UpdateIterator(entry_cell, limit_time, no_use);

	894 header_->num_no_use_entries++;

	895 break;

	896 case ENTRY_LOW_USE:

	897 UpdateIterator(entry_cell, limit_time, low_use);

	898 header_->num_low_use_entries++;

	899 break;

	900 case ENTRY_HIGH_USE:

	901 UpdateIterator(entry_cell, limit_time, high_use);

	902 header_->num_high_use_entries++;

	903 break;

	904 case ENTRY_EVICTED:

	905 header_->num_evicted_entries++;

	906 break;

	907 default:

	908 NOTREACHED();

	909 }

	910 }

	911 }

	912

	// Redistributes cells after the table changed size. Walks the first
	// (mask_ + 1) / 2 buckets of the main table plus the buckets chained to them
	// in |old_extra_table|, and re-inserts through MoveSingleCell() every cell
	// that has to change place. When |extra_bits_| is 0 the walk is treated as a
	// small-to-large format upgrade: the old main table is copied aside and every
	// cell is moved.
	913 void IndexTable::MoveCells(IndexBucket* old_extra_table) {

	914 int max_hash = (mask_ + 1) / 2;

	// Remember the old limit of the extra table for following links below, then
	// make |max_bucket| point at the last bucket of the main table.
	915 int max_bucket = header()->max_bucket;

	916 header()->max_bucket = mask_;

	// Moving cells must not change the number of used cells; verified at the end.
	917 int used_cells = header()->used_cells;

	918

	919 // Consider a large cache: a cell stores the upper 18 bits of the hash

	920 // (h >> 14). If the table is say 8 times the original size (growing from 4x),

	921 // the bit that we are interested in would be the 3rd bit of the stored value,

	922 // in other words 'multiplier' >> 1.

	923 uint32 new_bit = (1 << extra_bits_) >> 1;

	924

	925 scoped_ptr<IndexBucket[]> old_main_table;

	926 IndexBucket* source_table = main_table_;

	927 bool upgrade_format = !extra_bits_;

	928 if (upgrade_format) {

	929 // This method should deal with migrating a small table to a big one. Given

	930 // that the first thing to do is read the old table, set small_table_ for

	931 // the size of the old table. Now, when moving a cell, the result cannot be

	932 // placed in the old table or we will end up reading it again and attempting

	933 // to move it, so we have to copy the whole table at once.

	934 DCHECK(!small_table_);

	935 small_table_ = true;

	936 old_main_table.reset(new IndexBucket[max_hash]);

	937 memcpy(old_main_table.get(), main_table_, max_hash * sizeof(IndexBucket));

	938 memset(main_table_, 0, max_hash * sizeof(IndexBucket));

	939 source_table = old_main_table.get();

	940 }

	941

	// |i| is the main table index of the chain; |bucket_id| tracks the bucket
	// currently being read, so bucket_id != i means we are on an extra bucket.
	942 for (int i = 0; i < max_hash; i++) {

	943 int bucket_id = i;

	944 IndexBucket* bucket = &source_table[i];

	945 for (;;) {

	946 for (int j = 0; j < kCellsPerBucket; j++) {

	947 IndexCell& current_cell = bucket->cells[j];

	948 if (!GetAddressValue(current_cell))

	949 continue;

	950 DCHECK(SanityCheck(current_cell));

	951 if (bucket_id == i) {

	// Main table cell: it only moves on a format upgrade or when the newly
	// significant hash bit is set.
	952 if (upgrade_format \|\| (GetHashValue(current_cell) & new_bit)) {

	953 // Move this cell to the upper half of the table.

	954 MoveSingleCell(&current_cell, bucket_id * kCellsPerBucket + j, i,

	955 true);

	956 }

	957 } else {

	958 // All cells on extra buckets have to move.

	959 MoveSingleCell(&current_cell, bucket_id * kCellsPerBucket + j, i,

	960 true);

	961 }

	962 }

	963

	// Links are followed using the old table geometry (old main table size and
	// old |max_bucket|).
	964 bucket_id = GetNextBucket(max_hash, max_bucket, old_extra_table, &bucket);

	965 if (!bucket_id)

	966 break;

	967 }

	968 }

	969

	970 DCHECK_EQ(header()->used_cells, used_cells);

	971

	// The upgrade is complete: from now on the table is read as a large table.
	972 if (upgrade_format) {

	973 small_table_ = false;

	974 header()->flags &= ~SMALL_CACHE;

	975 }

	976 }

	977

	// Re-inserts the cell stored at |current_cell| (whose id is |cell_id|) through
	// CreateEntryCell(), copying its state, group, reuse count and timestamp to
	// the new cell. |main_table_index| provides the low bits needed to rebuild
	// the full hash. |growing| is true when called from MoveCells() and false when
	// called from HandleMisplacedCell().
	978 void IndexTable::MoveSingleCell(IndexCell* current_cell, int cell_id,

	979 int main_table_index, bool growing) {

	980 uint32 hash = GetFullHash(*current_cell, main_table_index);

	981 EntryCell old_cell(cell_id, hash, *current_cell, small_table_);

	982

	// During a format upgrade the old cell was read with |small_table_| set, but
	// the new cell must be created with it cleared; the flag is restored to true
	// on every path out of this method.
	983 bool upgrade_format = !extra_bits_ && growing;

	984 if (upgrade_format)

	985 small_table_ = false;

	986 EntryCell new_cell = CreateEntryCell(hash, old_cell.GetAddress());

	987

	988 if (!new_cell.IsValid()) {

	989 // We'll deal with this entry later.

	990 if (upgrade_format)

	991 small_table_ = true;

	992 return;

	993 }

	994

	995 new_cell.SetState(old_cell.GetState());

	996 new_cell.SetGroup(old_cell.GetGroup());

	997 new_cell.SetReuse(old_cell.GetReuse());

	998 new_cell.SetTimestamp(old_cell.GetTimestamp());

	999 Save(&new_cell);

	1000 modified_ = true;

	1001 if (upgrade_format)

	1002 small_table_ = true;

	1003

	// CreateEntryCell() marked the new cell in both bitmaps; undo that for a cell
	// that is in the deleted state.
	1004 if (old_cell.GetState() == ENTRY_DELETED) {

	1005 bitmap_->Set(new_cell.cell_id(), false);

	1006 backup_bitmap_->Set(new_cell.cell_id(), false);

	1007 }

	1008

	1009 if (!growing \|\| cell_id / kCellsPerBucket == main_table_index) {

	1010 // Only delete entries that live on the main table.

	// On a format upgrade the old cell is not cleared here (MoveCells() zeroes
	// the old part of the main table up front).
	1011 if (!upgrade_format) {

	1012 old_cell.Clear();

	1013 Write(old_cell);

	1014 }

	1015

	1016 if (cell_id != new_cell.cell_id()) {

	1017 bitmap_->Set(old_cell.cell_id(), false);

	1018 backup_bitmap_->Set(old_cell.cell_id(), false);

	1019 }

	1020 }

	// Balances the increment done by CreateEntryCell(): a move does not change
	// the number of used cells.
	1021 header()->used_cells--;

	1022 }

	1023

	1024 void IndexTable::HandleMisplacedCell(IndexCell* current_cell, int cell_id,

	1025 int main_table_index) {

	1026 // The cell may be misplaced, or a duplicate cell exists with this data.

	1027 uint32 hash = GetFullHash(*current_cell, main_table_index);

	1028 MoveSingleCell(current_cell, cell_id, main_table_index, false);

	1029

	1030 // Now look for a duplicate cell.

	1031 CheckBucketList(hash & mask_);

	1032 }

	1033

	1034 void IndexTable::CheckBucketList(int bucket_id) {

	1035 typedef std::pair<int, EntryGroup> AddressAndGroup;

	1036 std::set<AddressAndGroup> entries;

	1037 IndexBucket* bucket = &main_table_[bucket_id];

	1038 int bucket_hash = bucket_id;

	1039 for (;;) {

	1040 for (int i = 0; i < kCellsPerBucket; i++) {

	1041 IndexCell* current_cell = &bucket->cells[i];

	1042 if (!GetAddressValue(*current_cell))

	1043 continue;

	1044 if (!SanityCheck(*current_cell)) {

	1045 NOTREACHED();

	1046 current_cell->Clear();

	1047 continue;

	1048 }

	1049 int cell_id = bucket_id * kCellsPerBucket + i;

	1050 EntryCell cell(cell_id, GetFullHash(*current_cell, bucket_hash),

	1051 *current_cell, small_table_);

	1052 if (!entries.insert(std::make_pair(cell.GetAddress().value(),

	1053 cell.GetGroup())).second) {

	1054 current_cell->Clear();

	1055 continue;

	1056 }

	1057 CheckState(cell);

	1058 }

	1059

	1060 bucket_id = GetNextBucket(mask_ + 1, header()->max_bucket, extra_table_,

	1061 &bucket);

	1062 if (!bucket_id)

	1063 break;

	1064 }

	1065 }

	1066

	1067 uint32 IndexTable::GetAddressValue(const IndexCell& cell) {

	1068 if (small_table_)

	1069 return GetCellSmallTableAddress(cell);

	1070

	1071 return GetCellAddress(cell);

	1072 }

	1073

	1074 uint32 IndexTable::GetHashValue(const IndexCell& cell) {

	1075 if (small_table_)

	1076 return GetCellSmallTableHash(cell);

	1077

	1078 return GetCellHash(cell);

	1079 }

	1080

	1081 uint32 IndexTable::GetFullHash(const IndexCell& cell, uint32 lower_part) {

	1082 // It is OK for the high order bits of lower_part to overlap with the stored

	1083 // part of the hash.

	1084 if (small_table_)

	1085 return (GetCellSmallTableHash(cell) << kHashSmallTableShift) \| lower_part;

	1086

	1087 return (GetCellHash(cell) << kHashShift) \| lower_part;

	1088 }

	1089

	1090 // All the bits stored in the cell should match the provided hash.

	1091 bool IndexTable::IsHashMatch(const IndexCell& cell, uint32 hash) {

	1092 hash = small_table_ ? hash >> kHashSmallTableShift : hash >> kHashShift;

	1093 return GetHashValue(cell) == hash;

	1094 }

	1095

	1096 bool IndexTable::MisplacedHash(const IndexCell& cell, uint32 hash) {

	1097 if (!extra_bits_)

	1098 return false;

	1099

	1100 uint32 mask = (1 << extra_bits_) - 1;

	1101 hash = small_table_ ? hash >> kHashSmallTableShift : hash >> kHashShift;

	1102 return (GetHashValue(cell) & mask) != (hash & mask);

	1103 }

	1104

	1105 } // namespace disk_cache

OLD	NEW

« net/disk_cache/v3/index_table.h ('K') | « net/disk_cache/v3/index_table.h ('k') | net/disk_cache/v3/index_table_unittest.cc » ('j') | no next file with comments »