OLD | NEW |
| (Empty) |
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ | |
2 /* ***** BEGIN LICENSE BLOCK ***** | |
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 | |
4 * | |
5 * The contents of this file are subject to the Mozilla Public License Version | |
6 * 1.1 (the "License"); you may not use this file except in compliance with | |
7 * the License. You may obtain a copy of the License at | |
8 * http://www.mozilla.org/MPL/ | |
9 * | |
10 * Software distributed under the License is distributed on an "AS IS" basis, | |
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | |
12 * for the specific language governing rights and limitations under the | |
13 * License. | |
14 * | |
15 * The Original Code is the Mork Reader. | |
16 * | |
17 * The Initial Developer of the Original Code is | |
18 * Google Inc. | |
19 * Portions created by the Initial Developer are Copyright (C) 2006 | |
20 * the Initial Developer. All Rights Reserved. | |
21 * | |
22 * Contributor(s): | |
23 * Brian Ryner <bryner@brianryner.com> (original author) | |
24 * | |
25 * Alternatively, the contents of this file may be used under the terms of | |
26 * either the GNU General Public License Version 2 or later (the "GPL"), or | |
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | |
28 * in which case the provisions of the GPL or the LGPL are applicable instead | |
29 * of those above. If you wish to allow use of your version of this file only | |
30 * under the terms of either the GPL or the LGPL, and not to allow others to | |
31 * use your version of this file under the terms of the MPL, indicate your | |
32 * decision by deleting the provisions above and replace them with the notice | |
33 * and other provisions required by the GPL or the LGPL. If you do not delete | |
34 * the provisions above, a recipient may use your version of this file under | |
35 * the terms of any one of the MPL, the GPL or the LGPL. | |
36 * | |
37 * ***** END LICENSE BLOCK ***** */ | |
38 | |
39 // Source: | |
40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp | |
41 // This file has been converted to google style. | |
42 | |
43 #include "chrome/browser/mork_reader.h" | |
44 | |
45 #include <algorithm> | |
46 | |
47 #include "base/logging.h" | |
48 #include "base/string_util.h" | |
49 #include "chrome/browser/firefox_importer_utils.h" | |
50 #include "chrome/browser/history/history_types.h" | |
51 | |
namespace {

// Converts one upper-case hexadecimal digit ('0'-'9' or 'A'-'F') to its
// numeric value. Returns -1 for every other character; lower-case 'a'-'f'
// are not accepted.
inline int HexCharToInt(char c) {
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

// Unescapes a Mork value and returns the result.
//
// Two escape forms are recognized:
//   \c   - the character |c| is copied literally (used for '$', '\', ')').
//          A trailing backslash with nothing after it is dropped.
//   $XX  - the byte whose value is the two upper-case hex digits XX.
//          If either digit is invalid, the '$' and both characters are
//          dropped. If fewer than two characters follow the '$', only the
//          '$' is dropped and the remaining characters are handled normally.
// Every other character is copied unchanged.
std::string MorkUnescape(const std::string& input) {
  // We optimize for speed over space here -- size the result buffer to
  // the size of the source, which is an upper bound on the size of the
  // unescaped string.
  std::string result;
  const size_t input_length = input.size();
  result.reserve(input_length);

  for (size_t i = 0; i < input_length; i++) {
    const char c = input[i];
    if (c == '\\') {
      // Escaped literal: skip the backslash and append the next character.
      i++;
      if (i < input_length)
        result.push_back(input[i]);
    } else if (c == '$') {
      // Dollar sign introduces a two-digit hex byte. The bound is written as
      // an addition on purpose: |input_length - 2| would wrap around for
      // inputs shorter than two characters and let us read past the end.
      if (i + 2 < input_length) {
        const int first = HexCharToInt(input[++i]);
        const int second = HexCharToInt(input[++i]);
        if (first >= 0 && second >= 0)
          result.push_back(static_cast<char>((first << 4) | second));
      }
    } else {
      // Regular character, just append.
      result.push_back(c);
    }
  }
  return result;
}

}  // namespace
101 | |
102 MorkReader::MorkReader() { | |
103 } | |
104 | |
105 MorkReader::~MorkReader() { | |
106 // Need to delete all the pointers to vectors we have in the table. | |
107 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) | |
108 delete i->second; | |
109 } | |
110 | |
111 bool MorkReader::Read(const std::wstring& filename) { | |
112 stream_.open(filename.c_str()); | |
113 if (!stream_.is_open()) | |
114 return false; | |
115 | |
116 std::string line; | |
117 if (!ReadLine(&line) || | |
118 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) | |
119 return false; // Unexpected file format. | |
120 | |
121 IndexMap column_map; | |
122 while (ReadLine(&line)) { | |
123 // Trim off leading spaces | |
124 size_t idx = 0; | |
125 size_t len = line.size(); | |
126 while (idx < len && line[idx] == ' ') | |
127 ++idx; | |
128 if (idx >= len) | |
129 continue; | |
130 | |
131 // Look at the line to figure out what section type this is | |
132 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { | |
133 // Column map. We begin by creating a hash of column id to column name. | |
134 StringMap column_name_map; | |
135 ParseMap(line, idx, &column_name_map); | |
136 | |
137 // Now that we have the list of columns, we put them into a flat array. | |
138 // Rows will have value arrays of the same size, with indexes that | |
139 // correspond to the columns array. As we insert each column into the | |
140 // array, we also make an entry in columnMap so that we can look up the | |
141 // index given the column id. | |
142 columns_.reserve(column_name_map.size()); | |
143 | |
144 for (StringMap::const_iterator i = column_name_map.begin(); | |
145 i != column_name_map.end(); ++i) { | |
146 column_map[i->first] = static_cast<int>(columns_.size()); | |
147 MorkColumn col(i->first, i->second); | |
148 columns_.push_back(col); | |
149 } | |
150 } else if (StartsWithASCII(&line[idx], "<(", true)) { | |
151 // Value map. | |
152 ParseMap(line, idx, &value_map_); | |
153 } else if (line[idx] == '{' || line[idx] == '[') { | |
154 // Table / table row. | |
155 ParseTable(line, idx, &column_map); | |
156 } else { | |
157 // Don't know, hopefully don't care. | |
158 } | |
159 } | |
160 return true; | |
161 } | |
162 | |
163 // Parses a key/value map of the form | |
164 // <(k1=v1)(k2=v2)...> | |
165 bool MorkReader::ParseMap(const std::string& first_line, | |
166 size_t start_index, | |
167 StringMap* map) { | |
168 // If the first line is the a=c line (column map), just skip over it. | |
169 std::string line(first_line); | |
170 if (StartsWithASCII(line, "< <(a=c)>", true)) | |
171 ReadLine(&line); | |
172 | |
173 std::string key; | |
174 do { | |
175 size_t idx = start_index; | |
176 size_t len = line.size(); | |
177 size_t token_start; | |
178 | |
179 while (idx < len) { | |
180 switch (line[idx++]) { | |
181 case '(': | |
182 // Beginning of a key/value pair. | |
183 if (!key.empty()) { | |
184 DLOG(WARNING) << "unterminated key/value pair?"; | |
185 key.clear(); | |
186 } | |
187 | |
188 token_start = idx; | |
189 while (idx < len && line[idx] != '=') | |
190 ++idx; | |
191 key.assign(&line[token_start], idx - token_start); | |
192 break; | |
193 | |
194 case '=': { | |
195 // Beginning of the value. | |
196 if (key.empty()) { | |
197 DLOG(WARNING) << "stray value"; | |
198 break; | |
199 } | |
200 | |
201 token_start = idx; | |
202 while (idx < len && line[idx] != ')') { | |
203 if (line[idx] == '\\') | |
204 ++idx; // Skip escaped ')' characters. | |
205 ++idx; | |
206 } | |
207 size_t token_end = std::min(idx, len); | |
208 ++idx; | |
209 | |
210 std::string value = MorkUnescape( | |
211 std::string(&line[token_start], token_end - token_start)); | |
212 (*map)[key] = value; | |
213 key.clear(); | |
214 break; | |
215 } | |
216 case '>': | |
217 // End of the map. | |
218 DLOG_IF(WARNING, key.empty()) << | |
219 "map terminates inside of key/value pair"; | |
220 return true; | |
221 } | |
222 } | |
223 | |
224 // We should start reading the next line at the beginning. | |
225 start_index = 0; | |
226 } while (ReadLine(&line)); | |
227 | |
228 // We ran out of lines and the map never terminated. This probably indicates | |
229 // a parsing error. | |
230 DLOG(WARNING) << "didn't find end of key/value map"; | |
231 return false; | |
232 } | |
233 | |
234 // Parses a table row of the form [123(^45^67)..] | |
235 // (row id 123 has the value with id 67 for the column with id 45). | |
236 // A '^' prefix for a column or value references an entry in the column or | |
237 // value map. '=' is used as the separator when the value is a literal. | |
238 void MorkReader::ParseTable(const std::string& first_line, | |
239 size_t start_index, | |
240 const IndexMap* column_map) { | |
241 std::string line(first_line); | |
242 | |
243 // Column index of the cell we're parsing, minus one if invalid. | |
244 int column_index = -1; | |
245 | |
246 // Points to the current row we're parsing inside of the |table_|, will be | |
247 // NULL if we're not inside a row. | |
248 ColumnDataList* current_row = NULL; | |
249 | |
250 bool in_meta_row = false; | |
251 | |
252 do { | |
253 size_t idx = start_index; | |
254 size_t len = line.size(); | |
255 | |
256 while (idx < len) { | |
257 switch (line[idx++]) { | |
258 case '{': | |
259 // This marks the beginning of a table section. There's a lot of | |
260 // junk before the first row that looks like cell values but isn't. | |
261 // Skip to the first '['. | |
262 while (idx < len && line[idx] != '[') { | |
263 if (line[idx] == '{') { | |
264 in_meta_row = true; // The meta row is enclosed in { } | |
265 } else if (line[idx] == '}') { | |
266 in_meta_row = false; | |
267 } | |
268 ++idx; | |
269 } | |
270 break; | |
271 | |
272 case '[': { | |
273 // Start of a new row. Consume the row id, up to the first '('. | |
274 // Row edits also have a table namespace, separated from the row id | |
275 // by a colon. We don't make use of the namespace, but we need to | |
276 // make sure not to consider it part of the row id. | |
277 if (current_row) { | |
278 DLOG(WARNING) << "unterminated row?"; | |
279 current_row = NULL; | |
280 } | |
281 | |
282 // Check for a '-' at the start of the id. This signifies that | |
283 // if the row already exists, we should delete all columns from it | |
284 // before adding the new values. | |
285 bool cut_columns; | |
286 if (idx < len && line[idx] == '-') { | |
287 cut_columns = true; | |
288 ++idx; | |
289 } else { | |
290 cut_columns = false; | |
291 } | |
292 | |
293 // Locate the range of the ID. | |
294 size_t token_start = idx; // Index of the first char of the token. | |
295 while (idx < len && | |
296 line[idx] != '(' && | |
297 line[idx] != ']' && | |
298 line[idx] != ':') { | |
299 ++idx; | |
300 } | |
301 size_t token_end = idx; // Index of the char following the token. | |
302 while (idx < len && line[idx] != '(' && line[idx] != ']') { | |
303 ++idx; | |
304 } | |
305 | |
306 if (in_meta_row) { | |
307 // Need to create the meta row. | |
308 meta_row_.resize(columns_.size()); | |
309 current_row = &meta_row_; | |
310 } else { | |
311 // Find or create the regular row for this. | |
312 IDString row_id(&line[token_start], token_end - token_start); | |
313 RowMap::iterator found_row = table_.find(row_id); | |
314 if (found_row == table_.end()) { | |
315 // We don't already have this row, create a new one for it. | |
316 current_row = new ColumnDataList(columns_.size()); | |
317 table_[row_id] = current_row; | |
318 } else { | |
319 // The row already exists and we're adding/replacing things. | |
320 current_row = found_row->second; | |
321 } | |
322 } | |
323 if (cut_columns) { | |
324 for (size_t i = 0; i < current_row->size(); ++i) | |
325 (*current_row)[i].clear(); | |
326 } | |
327 break; | |
328 } | |
329 | |
330 case ']': | |
331 // We're done with the row. | |
332 current_row = NULL; | |
333 in_meta_row = false; | |
334 break; | |
335 | |
336 case '(': { | |
337 if (!current_row) { | |
338 DLOG(WARNING) << "cell value outside of row"; | |
339 break; | |
340 } | |
341 | |
342 bool column_is_atom; | |
343 if (line[idx] == '^') { | |
344 column_is_atom = true; | |
345 ++idx; // This is not part of the column id, advance past it. | |
346 } else { | |
347 column_is_atom = false; | |
348 } | |
349 size_t token_start = idx; | |
350 while (idx < len && line[idx] != '^' && line[idx] != '=') { | |
351 if (line[idx] == '\\') | |
352 ++idx; // Skip escaped characters. | |
353 ++idx; | |
354 } | |
355 | |
356 size_t token_end = std::min(idx, len); | |
357 | |
358 IDString column; | |
359 if (column_is_atom) | |
360 column.assign(&line[token_start], token_end - token_start); | |
361 else | |
362 column = MorkUnescape(line.substr(token_start, | |
363 token_end - token_start)); | |
364 | |
365 IndexMap::const_iterator found_column = column_map->find(column); | |
366 if (found_column == column_map->end()) { | |
367 DLOG(WARNING) << "Column not in column map, discarding it"; | |
368 column_index = -1; | |
369 } else { | |
370 column_index = found_column->second; | |
371 } | |
372 break; | |
373 } | |
374 | |
375 case '=': | |
376 case '^': { | |
377 if (column_index == -1) { | |
378 DLOG(WARNING) << "stray ^ or = marker"; | |
379 break; | |
380 } | |
381 | |
382 bool value_is_atom = (line[idx - 1] == '^'); | |
383 size_t token_start = idx - 1; // Include the '=' or '^' marker. | |
384 while (idx < len && line[idx] != ')') { | |
385 if (line[idx] == '\\') | |
386 ++idx; // Skip escaped characters. | |
387 ++idx; | |
388 } | |
389 size_t token_end = std::min(idx, len); | |
390 ++idx; | |
391 | |
392 if (value_is_atom) { | |
393 (*current_row)[column_index].assign(&line[token_start], | |
394 token_end - token_start); | |
395 } else { | |
396 (*current_row)[column_index] = | |
397 MorkUnescape(line.substr(token_start, token_end - token_start)); | |
398 } | |
399 column_index = -1; | |
400 } | |
401 break; | |
402 } | |
403 } | |
404 | |
405 // Start parsing the next line at the beginning. | |
406 start_index = 0; | |
407 } while (current_row && ReadLine(&line)); | |
408 } | |
409 | |
410 bool MorkReader::ReadLine(std::string* line) { | |
411 line->resize(256); | |
412 std::getline(stream_, *line); | |
413 if (stream_.eof() || stream_.bad()) | |
414 return false; | |
415 | |
416 while (!line->empty() && (*line)[line->size() - 1] == '\\') { | |
417 // There is a continuation for this line. Read it and append. | |
418 std::string new_line; | |
419 std::getline(stream_, new_line); | |
420 if (stream_.eof()) | |
421 return false; | |
422 line->erase(line->size() - 1); | |
423 line->append(new_line); | |
424 } | |
425 | |
426 return true; | |
427 } | |
428 | |
429 void MorkReader::NormalizeValue(std::string* value) const { | |
430 if (value->empty()) | |
431 return; | |
432 MorkReader::StringMap::const_iterator i; | |
433 switch (value->at(0)) { | |
434 case '^': | |
435 // Hex ID, lookup the name for it in the |value_map_|. | |
436 i = value_map_.find(value->substr(1)); | |
437 if (i == value_map_.end()) | |
438 value->clear(); | |
439 else | |
440 *value = i->second; | |
441 break; | |
442 case '=': | |
443 // Just use the literal after the equals sign. | |
444 value->erase(value->begin()); | |
445 break; | |
446 default: | |
447 // Anything else is invalid. | |
448 value->clear(); | |
449 break; | |
450 } | |
451 } | |
452 | |
453 // Source: | |
454 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp | |
455 | |
// Columns of entry (non-meta) history rows that the importer reads. Each
// enumerator is the position of that column's name in |gColumnNames| and of
// its index in |TableReadClosure::column_indexes|.
enum {
  kURLColumn = 0,
  kNameColumn,
  kVisitCountColumn,
  kHiddenColumn,
  kTypedColumn,
  kLastVisitColumn,
  kColumnCount  // Keep me last.
};
466 | |
// Mork column names of the history columns we import, listed in the same
// order as the k*Column enumerators that index this array.
static const char* const gColumnNames[] = {
  "URL",
  "Name",
  "VisitCount",
  "Hidden",
  "Typed",
  "LastVisitDate"
};
470 | |
471 struct TableReadClosure { | |
472 explicit TableReadClosure(const MorkReader& r) | |
473 : reader(r), | |
474 swap_bytes(false), | |
475 byte_order_column(-1) { | |
476 for (int i = 0; i < kColumnCount; ++i) | |
477 column_indexes[i] = -1; | |
478 } | |
479 | |
480 // Backpointers to the reader and history we're operating on. | |
481 const MorkReader& reader; | |
482 | |
483 // Whether we need to swap bytes (file format is other-endian). | |
484 bool swap_bytes; | |
485 | |
486 // Indexes of the columns that we care about. | |
487 int column_indexes[kColumnCount]; | |
488 int byte_order_column; | |
489 }; | |
490 | |
491 void AddToHistory(MorkReader::ColumnDataList* column_values, | |
492 const TableReadClosure& data, | |
493 std::vector<history::URLRow>* rows) { | |
494 std::string values[kColumnCount]; | |
495 | |
496 for (size_t i = 0; i < kColumnCount; ++i) { | |
497 if (data.column_indexes[i] != -1) { | |
498 values[i] = column_values->at(data.column_indexes[i]); | |
499 data.reader.NormalizeValue(&values[i]); | |
500 // Do not import hidden records. | |
501 if (i == kHiddenColumn && values[i] == "1") | |
502 return; | |
503 } | |
504 } | |
505 | |
506 GURL url(values[kURLColumn]); | |
507 | |
508 if (CanImportURL(url)) { | |
509 history::URLRow row(url); | |
510 | |
511 // title is really a UTF-16 string at this point | |
512 std::wstring title; | |
513 if (data.swap_bytes) { | |
514 CodepageToWide(values[kNameColumn], "UTF-16BE", | |
515 OnStringUtilConversionError::SKIP, &title); | |
516 } else { | |
517 CodepageToWide(values[kNameColumn], "UTF-16LE", | |
518 OnStringUtilConversionError::SKIP, &title); | |
519 } | |
520 row.set_title(title); | |
521 | |
522 int count = atoi(values[kVisitCountColumn].c_str()); | |
523 if (count == 0) | |
524 count = 1; | |
525 row.set_visit_count(count); | |
526 | |
527 time_t date = StringToInt64(values[kLastVisitColumn]); | |
528 if (date != 0) | |
529 row.set_last_visit(Time::FromTimeT(date/1000000)); | |
530 | |
531 bool is_typed = (values[kTypedColumn] == "1"); | |
532 if (is_typed) | |
533 row.set_typed_count(1); | |
534 | |
535 rows->push_back(row); | |
536 } | |
537 } | |
538 | |
539 // It sets up the file stream and loops over the lines in the file to | |
540 // parse them, then adds the resulting row set to history. | |
541 void ImportHistoryFromFirefox2(std::wstring file, MessageLoop* loop, | |
542 ProfileWriter* writer) { | |
543 MorkReader reader; | |
544 reader.Read(file); | |
545 | |
546 // Gather up the column ids so we don't need to find them on each row | |
547 TableReadClosure data(reader); | |
548 const MorkReader::MorkColumnList& columns = reader.columns(); | |
549 for (size_t i = 0; i < columns.size(); ++i) { | |
550 for (int j = 0; j < kColumnCount; ++j) | |
551 if (columns[i].name == gColumnNames[j]) { | |
552 data.column_indexes[j] = static_cast<int>(i); | |
553 break; | |
554 } | |
555 if (columns[i].name == "ByteOrder") | |
556 data.byte_order_column = static_cast<int>(i); | |
557 } | |
558 | |
559 // Determine the byte order from the table's meta-row. | |
560 const MorkReader::ColumnDataList& meta_row = reader.meta_row(); | |
561 if (!meta_row.empty() && data.byte_order_column != -1) { | |
562 std::string byte_order = meta_row[data.byte_order_column]; | |
563 if (!byte_order.empty()) { | |
564 // Note whether the file uses a non-native byte ordering. | |
565 // If it does, we'll have to swap bytes for PRUnichar values. | |
566 // "BE" and "LE" are the only recognized values, anything | |
567 // else is garbage and the file will be treated as native-endian | |
568 // (no swapping). | |
569 std::string byte_order_value(byte_order); | |
570 reader.NormalizeValue(&byte_order_value); | |
571 data.swap_bytes = (byte_order_value == "BE"); | |
572 } | |
573 } | |
574 | |
575 std::vector<history::URLRow> rows; | |
576 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) | |
577 AddToHistory(i->second, data, &rows); | |
578 if (!rows.empty()) | |
579 loop->PostTask(FROM_HERE, NewRunnableMethod(writer, | |
580 &ProfileWriter::AddHistoryPage, rows)); | |
581 } | |
OLD | NEW |