chrome/browser/mork_reader.cc - Issue 3035: Move importer files into an importer subdirectory.

Side by Side Diff: chrome/browser/mork_reader.cc

Issue 3035: Move importer files into an importer subdirectory. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /* -- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -- */

2 /* *** BEGIN LICENSE BLOCK ***

3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1

4 *

5 * The contents of this file are subject to the Mozilla Public License Version

6 * 1.1 (the "License"); you may not use this file except in compliance with

7 * the License. You may obtain a copy of the License at

8 * http://www.mozilla.org/MPL/

9 *

10 * Software distributed under the License is distributed on an "AS IS" basis,

11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License

12 * for the specific language governing rights and limitations under the

13 * License.

14 *

15 * The Original Code is the Mork Reader.

16 *

17 * The Initial Developer of the Original Code is

18 * Google Inc.

19 * Portions created by the Initial Developer are Copyright (C) 2006

20 * the Initial Developer. All Rights Reserved.

21 *

22 * Contributor(s):

23 * Brian Ryner <bryner@brianryner.com> (original author)

24 *

25 * Alternatively, the contents of this file may be used under the terms of

26 * either the GNU General Public License Version 2 or later (the "GPL"), or

27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),

28 * in which case the provisions of the GPL or the LGPL are applicable instead

29 * of those above. If you wish to allow use of your version of this file only

30 * under the terms of either the GPL or the LGPL, and not to allow others to

31 * use your version of this file under the terms of the MPL, indicate your

32 * decision by deleting the provisions above and replace them with the notice

33 * and other provisions required by the GPL or the LGPL. If you do not delete

34 * the provisions above, a recipient may use your version of this file under

35 * the terms of any one of the MPL, the GPL or the LGPL.

36 *

37 * *** END LICENSE BLOCK *** */

38

39 // Source:

40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp

41 // This file has been converted to google style.

42

43 #include "chrome/browser/mork_reader.h"

44

45 #include <algorithm>

46

47 #include "base/logging.h"

48 #include "base/string_util.h"

49 #include "chrome/browser/firefox_importer_utils.h"

50 #include "chrome/browser/history/history_types.h"

51

namespace {

// Converts a single hexadecimal digit to its numeric value.
// Only '0'-'9' and upper-case 'A'-'F' are recognized; any other character
// (including lower-case 'a'-'f') yields -1.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Two escape forms are handled:
//   \c   - the character |c| is taken literally (used for '$', '\', ')').
//          A trailing backslash with nothing after it is dropped.
//   $XX  - a byte given as two upper-case hex digits. If either digit is
//          invalid, the whole three-character sequence is dropped. If fewer
//          than two characters follow the '$', only the '$' is dropped and
//          the remaining characters are copied as regular text.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal, skip the backslash, append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign denotes a hex character. The bound is written as
      // |i + 2 < input_length| on purpose: the equivalent-looking
      // |i < input_length - 2| wraps around for inputs shorter than two
      // characters (size_t is unsigned) and would read past the end.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace

101

102 MorkReader::MorkReader() {

103 }

104

105 MorkReader::~MorkReader() {

106 // Need to delete all the pointers to vectors we have in the table.

107 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)

108 delete i->second;

109 }

110

111 bool MorkReader::Read(const std::wstring& filename) {

112 stream_.open(filename.c_str());

113 if (!stream_.is_open())

114 return false;

115

116 std::string line;

117 if (!ReadLine(&line) \|\|

118 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)

119 return false; // Unexpected file format.

120

121 IndexMap column_map;

122 while (ReadLine(&line)) {

123 // Trim off leading spaces

124 size_t idx = 0;

125 size_t len = line.size();

126 while (idx < len && line[idx] == ' ')

127 ++idx;

128 if (idx >= len)

129 continue;

130

131 // Look at the line to figure out what section type this is

132 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {

133 // Column map. We begin by creating a hash of column id to column name.

134 StringMap column_name_map;

135 ParseMap(line, idx, &column_name_map);

136

137 // Now that we have the list of columns, we put them into a flat array.

138 // Rows will have value arrays of the same size, with indexes that

139 // correspond to the columns array. As we insert each column into the

140 // array, we also make an entry in columnMap so that we can look up the

141 // index given the column id.

142 columns_.reserve(column_name_map.size());

143

144 for (StringMap::const_iterator i = column_name_map.begin();

145 i != column_name_map.end(); ++i) {

146 column_map[i->first] = static_cast<int>(columns_.size());

147 MorkColumn col(i->first, i->second);

148 columns_.push_back(col);

149 }

150 } else if (StartsWithASCII(&line[idx], "<(", true)) {

151 // Value map.

152 ParseMap(line, idx, &value_map_);

153 } else if (line[idx] == '{' \|\| line[idx] == '[') {

154 // Table / table row.

155 ParseTable(line, idx, &column_map);

156 } else {

157 // Don't know, hopefully don't care.

158 }

159 }

160 return true;

161 }

162

163 // Parses a key/value map of the form

164 // <(k1=v1)(k2=v2)...>

165 bool MorkReader::ParseMap(const std::string& first_line,

166 size_t start_index,

167 StringMap* map) {

168 // If the first line is the a=c line (column map), just skip over it.

169 std::string line(first_line);

170 if (StartsWithASCII(line, "< <(a=c)>", true))

171 ReadLine(&line);

172

173 std::string key;

174 do {

175 size_t idx = start_index;

176 size_t len = line.size();

177 size_t token_start;

178

179 while (idx < len) {

180 switch (line[idx++]) {

181 case '(':

182 // Beginning of a key/value pair.

183 if (!key.empty()) {

184 DLOG(WARNING) << "unterminated key/value pair?";

185 key.clear();

186 }

187

188 token_start = idx;

189 while (idx < len && line[idx] != '=')

190 ++idx;

191 key.assign(&line[token_start], idx - token_start);

192 break;

193

194 case '=': {

195 // Beginning of the value.

196 if (key.empty()) {

197 DLOG(WARNING) << "stray value";

198 break;

199 }

200

201 token_start = idx;

202 while (idx < len && line[idx] != ')') {

203 if (line[idx] == '\\')

204 ++idx; // Skip escaped ')' characters.

205 ++idx;

206 }

207 size_t token_end = std::min(idx, len);

208 ++idx;

209

210 std::string value = MorkUnescape(

211 std::string(&line[token_start], token_end - token_start));

212 (*map)[key] = value;

213 key.clear();

214 break;

215 }

216 case '>':

217 // End of the map.

218 DLOG_IF(WARNING, key.empty()) <<

219 "map terminates inside of key/value pair";

220 return true;

221 }

222 }

223

224 // We should start reading the next line at the beginning.

225 start_index = 0;

226 } while (ReadLine(&line));

227

228 // We ran out of lines and the map never terminated. This probably indicates

229 // a parsing error.

230 DLOG(WARNING) << "didn't find end of key/value map";

231 return false;

232 }

233

234 // Parses a table row of the form [123(^45^67)..]

235 // (row id 123 has the value with id 67 for the column with id 45).

236 // A '^' prefix for a column or value references an entry in the column or

237 // value map. '=' is used as the separator when the value is a literal.

238 void MorkReader::ParseTable(const std::string& first_line,

239 size_t start_index,

240 const IndexMap* column_map) {

241 std::string line(first_line);

242

243 // Column index of the cell we're parsing, minus one if invalid.

244 int column_index = -1;

245

246 // Points to the current row we're parsing inside of the \|table_\|, will be

247 // NULL if we're not inside a row.

248 ColumnDataList* current_row = NULL;

249

250 bool in_meta_row = false;

251

252 do {

253 size_t idx = start_index;

254 size_t len = line.size();

255

256 while (idx < len) {

257 switch (line[idx++]) {

258 case '{':

259 // This marks the beginning of a table section. There's a lot of

260 // junk before the first row that looks like cell values but isn't.

261 // Skip to the first '['.

262 while (idx < len && line[idx] != '[') {

263 if (line[idx] == '{') {

264 in_meta_row = true; // The meta row is enclosed in { }

265 } else if (line[idx] == '}') {

266 in_meta_row = false;

267 }

268 ++idx;

269 }

270 break;

271

272 case '[': {

273 // Start of a new row. Consume the row id, up to the first '('.

274 // Row edits also have a table namespace, separated from the row id

275 // by a colon. We don't make use of the namespace, but we need to

276 // make sure not to consider it part of the row id.

277 if (current_row) {

278 DLOG(WARNING) << "unterminated row?";

279 current_row = NULL;

280 }

281

282 // Check for a '-' at the start of the id. This signifies that

283 // if the row already exists, we should delete all columns from it

284 // before adding the new values.

285 bool cut_columns;

286 if (idx < len && line[idx] == '-') {

287 cut_columns = true;

288 ++idx;

289 } else {

290 cut_columns = false;

291 }

292

293 // Locate the range of the ID.

294 size_t token_start = idx; // Index of the first char of the token.

295 while (idx < len &&

296 line[idx] != '(' &&

297 line[idx] != ']' &&

298 line[idx] != ':') {

299 ++idx;

300 }

301 size_t token_end = idx; // Index of the char following the token.

302 while (idx < len && line[idx] != '(' && line[idx] != ']') {

303 ++idx;

304 }

305

306 if (in_meta_row) {

307 // Need to create the meta row.

308 meta_row_.resize(columns_.size());

309 current_row = &meta_row_;

310 } else {

311 // Find or create the regular row for this.

312 IDString row_id(&line[token_start], token_end - token_start);

313 RowMap::iterator found_row = table_.find(row_id);

314 if (found_row == table_.end()) {

315 // We don't already have this row, create a new one for it.

316 current_row = new ColumnDataList(columns_.size());

317 table_[row_id] = current_row;

318 } else {

319 // The row already exists and we're adding/replacing things.

320 current_row = found_row->second;

321 }

322 }

323 if (cut_columns) {

324 for (size_t i = 0; i < current_row->size(); ++i)

325 (*current_row)[i].clear();

326 }

327 break;

328 }

329

330 case ']':

331 // We're done with the row.

332 current_row = NULL;

333 in_meta_row = false;

334 break;

335

336 case '(': {

337 if (!current_row) {

338 DLOG(WARNING) << "cell value outside of row";

339 break;

340 }

341

342 bool column_is_atom;

343 if (line[idx] == '^') {

344 column_is_atom = true;

345 ++idx; // This is not part of the column id, advance past it.

346 } else {

347 column_is_atom = false;

348 }

349 size_t token_start = idx;

350 while (idx < len && line[idx] != '^' && line[idx] != '=') {

351 if (line[idx] == '\\')

352 ++idx; // Skip escaped characters.

353 ++idx;

354 }

355

356 size_t token_end = std::min(idx, len);

357

358 IDString column;

359 if (column_is_atom)

360 column.assign(&line[token_start], token_end - token_start);

361 else

362 column = MorkUnescape(line.substr(token_start,

363 token_end - token_start));

364

365 IndexMap::const_iterator found_column = column_map->find(column);

366 if (found_column == column_map->end()) {

367 DLOG(WARNING) << "Column not in column map, discarding it";

368 column_index = -1;

369 } else {

370 column_index = found_column->second;

371 }

372 break;

373 }

374

375 case '=':

376 case '^': {

377 if (column_index == -1) {

378 DLOG(WARNING) << "stray ^ or = marker";

379 break;

380 }

381

382 bool value_is_atom = (line[idx - 1] == '^');

383 size_t token_start = idx - 1; // Include the '=' or '^' marker.

384 while (idx < len && line[idx] != ')') {

385 if (line[idx] == '\\')

386 ++idx; // Skip escaped characters.

387 ++idx;

388 }

389 size_t token_end = std::min(idx, len);

390 ++idx;

391

392 if (value_is_atom) {

393 (*current_row)[column_index].assign(&line[token_start],

394 token_end - token_start);

395 } else {

396 (*current_row)[column_index] =

397 MorkUnescape(line.substr(token_start, token_end - token_start));

398 }

399 column_index = -1;

400 }

401 break;

402 }

403 }

404

405 // Start parsing the next line at the beginning.

406 start_index = 0;

407 } while (current_row && ReadLine(&line));

408 }

409

410 bool MorkReader::ReadLine(std::string* line) {

411 line->resize(256);

412 std::getline(stream_, *line);

413 if (stream_.eof() \|\| stream_.bad())

414 return false;

415

416 while (!line->empty() && (*line)[line->size() - 1] == '\\') {

417 // There is a continuation for this line. Read it and append.

418 std::string new_line;

419 std::getline(stream_, new_line);

420 if (stream_.eof())

421 return false;

422 line->erase(line->size() - 1);

423 line->append(new_line);

424 }

425

426 return true;

427 }

428

429 void MorkReader::NormalizeValue(std::string* value) const {

430 if (value->empty())

431 return;

432 MorkReader::StringMap::const_iterator i;

433 switch (value->at(0)) {

434 case '^':

435 // Hex ID, lookup the name for it in the \|value_map_\|.

436 i = value_map_.find(value->substr(1));

437 if (i == value_map_.end())

438 value->clear();

439 else

440 *value = i->second;

441 break;

442 case '=':

443 // Just use the literal after the equals sign.

444 value->erase(value->begin());

445 break;

446 default:

447 // Anything else is invalid.

448 value->clear();

449 break;

450 }

451 }

452

453 // Source:

454 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp

455

// Positions of the history columns we import from entry (non-meta) rows.
// The enumerators double as indexes into |gColumnNames| below, so the two
// lists must be kept in the same order.
enum {
  kURLColumn,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Keep me last.
};

// Mork column name for each of the enumerators above.
static const char* const gColumnNames[] = {
  "URL",            // kURLColumn
  "Name",           // kNameColumn
  "VisitCount",     // kVisitCountColumn
  "Hidden",         // kHiddenColumn
  "Typed",          // kTypedColumn
  "LastVisitDate",  // kLastVisitColumn
};

470

471 struct TableReadClosure {

472 explicit TableReadClosure(const MorkReader& r)

473 : reader(r),

474 swap_bytes(false),

475 byte_order_column(-1) {

476 for (int i = 0; i < kColumnCount; ++i)

477 column_indexes[i] = -1;

478 }

479

480 // Backpointers to the reader and history we're operating on.

481 const MorkReader& reader;

482

483 // Whether we need to swap bytes (file format is other-endian).

484 bool swap_bytes;

485

486 // Indexes of the columns that we care about.

487 int column_indexes[kColumnCount];

488 int byte_order_column;

489 };

490

491 void AddToHistory(MorkReader::ColumnDataList* column_values,

492 const TableReadClosure& data,

493 std::vector<history::URLRow>* rows) {

494 std::string values[kColumnCount];

495

496 for (size_t i = 0; i < kColumnCount; ++i) {

497 if (data.column_indexes[i] != -1) {

498 values[i] = column_values->at(data.column_indexes[i]);

499 data.reader.NormalizeValue(&values[i]);

500 // Do not import hidden records.

501 if (i == kHiddenColumn && values[i] == "1")

502 return;

503 }

504 }

505

506 GURL url(values[kURLColumn]);

507

508 if (CanImportURL(url)) {

509 history::URLRow row(url);

510

511 // title is really a UTF-16 string at this point

512 std::wstring title;

513 if (data.swap_bytes) {

514 CodepageToWide(values[kNameColumn], "UTF-16BE",

515 OnStringUtilConversionError::SKIP, &title);

516 } else {

517 CodepageToWide(values[kNameColumn], "UTF-16LE",

518 OnStringUtilConversionError::SKIP, &title);

519 }

520 row.set_title(title);

521

522 int count = atoi(values[kVisitCountColumn].c_str());

523 if (count == 0)

524 count = 1;

525 row.set_visit_count(count);

526

527 time_t date = StringToInt64(values[kLastVisitColumn]);

528 if (date != 0)

529 row.set_last_visit(Time::FromTimeT(date/1000000));

530

531 bool is_typed = (values[kTypedColumn] == "1");

532 if (is_typed)

533 row.set_typed_count(1);

534

535 rows->push_back(row);

536 }

537 }

538

539 // It sets up the file stream and loops over the lines in the file to

540 // parse them, then adds the resulting row set to history.

541 void ImportHistoryFromFirefox2(std::wstring file, MessageLoop* loop,

542 ProfileWriter* writer) {

543 MorkReader reader;

544 reader.Read(file);

545

546 // Gather up the column ids so we don't need to find them on each row

547 TableReadClosure data(reader);

548 const MorkReader::MorkColumnList& columns = reader.columns();

549 for (size_t i = 0; i < columns.size(); ++i) {

550 for (int j = 0; j < kColumnCount; ++j)

551 if (columns[i].name == gColumnNames[j]) {

552 data.column_indexes[j] = static_cast<int>(i);

553 break;

554 }

555 if (columns[i].name == "ByteOrder")

556 data.byte_order_column = static_cast<int>(i);

557 }

558

559 // Determine the byte order from the table's meta-row.

560 const MorkReader::ColumnDataList& meta_row = reader.meta_row();

561 if (!meta_row.empty() && data.byte_order_column != -1) {

562 std::string byte_order = meta_row[data.byte_order_column];

563 if (!byte_order.empty()) {

564 // Note whether the file uses a non-native byte ordering.

565 // If it does, we'll have to swap bytes for PRUnichar values.

566 // "BE" and "LE" are the only recognized values, anything

567 // else is garbage and the file will be treated as native-endian

568 // (no swapping).

569 std::string byte_order_value(byte_order);

570 reader.NormalizeValue(&byte_order_value);

571 data.swap_bytes = (byte_order_value == "BE");

572 }

573 }

574

575 std::vector<history::URLRow> rows;

576 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)

577 AddToHistory(i->second, data, &rows);

578 if (!rows.empty())

579 loop->PostTask(FROM_HERE, NewRunnableMethod(writer,

580 &ProfileWriter::AddHistoryPage, rows));

581 }

OLD	NEW

« no previous file with comments | « chrome/browser/mork_reader.h ('k') | chrome/browser/title_chomper.h » ('j') | no next file with comments »