| OLD | NEW |
| (Empty) |
| 1 // TODO(jacobr): convert this file to Dart once Dart supports all of the | |
| 2 // nodejs functionality used here. For example, search for all occurrences of | |
| 3 // "http." and "fs." | |
| 4 var http = require('http'); | |
| 5 var fs = require('fs'); | |
| 6 | |
// Make sure the directory that receives the crawled pages exists.
try {
  fs.mkdirSync('output/crawl');
} catch (e) {
  // It doesn't matter if the directory already exists, but any other failure
  // (missing parent directory, permissions, ...) would make every later write
  // fail, so surface it here instead of swallowing it.
  if (!e || e.code !== 'EEXIST') {
    throw e;
  }
}

// List of DOM type names to crawl; each one is used below as the base name of
// a search-result file and as a key of cacheData.
var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

// Maps each type name to the list of {index, link, title} entries that were
// scheduled for scraping. Serialized to output/crawl/cache.json at the end.
var cacheData = {};
| 16 | |
/**
 * Requests the Google web-cache copy of a developer.mozilla.org page and
 * writes the response body to output/crawl/<filename>.html.
 *
 * The HTTP request is asynchronous: this function returns as soon as the
 * request has been issued, and the file is written later from the response
 * callbacks.
 *
 * @param {string} filename Base name (without extension) of the output file.
 * @param {string} link Url of the page to scrape. Must start with
 *     'https://developer.mozilla.org/'.
 * @throws {Error} Synchronously, if link does not start with the expected
 *     prefix. A request failure is thrown from the request's 'error' handler,
 *     i.e. outside of the caller's stack.
 */
function scrape(filename, link) {
  console.log(link);
  var prefix = 'https://developer.mozilla.org/';
  if (link.indexOf(prefix) !== 0) {
    throw new Error("Unexpected url: " + link);
  }

  // We crawl content from googleusercontent.com so we don't have to worry about
  // crawler politeness like we would have to if scraping developer.mozilla.org
  // directly.
  var options = {
    host: 'webcache.googleusercontent.com',
    path: "/search?q=cache:" + link,
    port: 80,
    method: 'GET'
  };

  var req = http.request(options, function(res) {
    res.setEncoding('utf8');
    var data = '';
    var written = false;

    res.on('data', function(d) {
      data += d;
    });

    // Both 'end' and 'close' are listened to so that a response that is closed
    // without a clean end still gets saved; the flag keeps the file from being
    // written (and logged) twice when both events fire.
    var onClose = function() {
      if (written) {
        return;
      }
      written = true;
      console.log("Writing crawl result for " + link);
      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8');
    };
    res.on('close', onClose);
    res.on('end', onClose);
  });

  // Attach the error handler before the request is flushed.
  req.on('error', function(e) {
    throw new Error("Error " + e + " scraping " + link);
  });
  req.end();
}
| 56 | |
// Hardcode the correct matching url for a few types where the search engine
// gets the wrong answer.
var hardcodedLinks = {
  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen',
  'Text': 'https://developer.mozilla.org/en/DOM/Text',
  'Touch': 'https://developer.mozilla.org/en/DOM/Touch',
  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span',
  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre',
  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame',
  'HTMLFrameSetElement': 'https://developer.mozilla.org/en/HTML/Element/frameset',
  // The original literal ended in a stray ';' inside the quotes, which made
  // the scraped url wrong.
  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation',
  'Notification': 'https://developer.mozilla.org/en/DOM/notification',
  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase'
};

var prefix = 'https://developer.mozilla.org/';
var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';

// For every DOM type, record the candidate documentation pages in cacheData
// and kick off a scrape for each of them.
for (var i = 0; i < domTypes.length; i++) {
  var type = domTypes[i];

  // Json containing the search results for the current type.
  var data = fs.readFileSync("output/search/" + type + ".json", 'utf8');
  var json = JSON.parse(data);
  if (!('items' in json)) {
    console.warn("No search results for " + type);
    continue;
  }
  var items = json['items'];

  var entry = [];
  cacheData[type] = entry;

  // hasOwnProperty keeps type names such as 'constructor' from matching
  // properties inherited from Object.prototype.
  var link = Object.prototype.hasOwnProperty.call(hardcodedLinks, type) ?
      hardcodedLinks[type] : null;
  if (link != null) {
    entry.push({index: 0, link: link, title: type});
    scrape(type + 0, link);
    continue;
  }

  for (var j = 0; j < items.length; j++) {
    var item = items[j];
    // Be optimistic and replace article not found links with links to where the
    // article should be.
    link = item['link'];
    if (link.indexOf(notFoundPrefix) === 0) {
      link = prefix + link.substr(notFoundPrefix.length);
    }

    entry.push({index: j, link: link, title: item['title']});
    scrape(type + j, link);
  }
}

// The scrapes above complete asynchronously; this index only records which
// pages were requested and the file name suffix (index) used for each.
fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8');
| OLD | NEW |