OLD | NEW |
| (Empty) |
// TODO(jacobr): convert this file to Dart once Dart supports all of the
// nodejs functionality used here. For example, search for all occurrences of
// "http." and "fs."
4 var http = require('http'); | |
5 var fs = require('fs'); | |
6 | |
// Create the directory that crawl results are written to.
try {
  fs.mkdirSync('output/crawl');
} catch (e) {
  // It doesn't matter if the directory already exists. Any other failure
  // (for example a missing 'output' parent directory or a permission problem)
  // is a real error and must not be hidden.
  if (!e || e.code !== 'EEXIST') {
    throw e;
  }
}

// DOM type names to crawl, loaded from data/domTypes.json. The rest of the
// script indexes this like an array and uses each element as a string.
var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

// Maps each DOM type name to its list of {index, link, title} entries; it is
// serialized to output/crawl/cache.json at the end of the script.
var cacheData = {};
16 | |
/**
 * Fetches the Google web-cache copy of a developer.mozilla.org page and saves
 * the response body to output/crawl/<filename>.html.
 *
 * The request is asynchronous: this function returns as soon as the request
 * has been sent, and the file is written once the response has finished.
 *
 * @param {string} filename Base name (without extension) of the output file.
 * @param {string} link URL to scrape; must start with
 *     'https://developer.mozilla.org/'.
 * @throws {Error} Synchronously, if link does not start with the expected
 *     prefix. A failed request also throws an Error, but from the request's
 *     'error' event handler, i.e. outside of the caller's stack.
 */
function scrape(filename, link) {
  console.log(link);
  var prefix = 'https://developer.mozilla.org/';
  if (link.indexOf(prefix) !== 0) {
    throw new Error("Unexpected url: " + link);
  }
  var scrapePath = "/search?q=cache:" + link;
  // We crawl content from googleusercontent.com so we don't have to worry about
  // crawler politeness like we would have to if scraping developer.mozilla.org
  // directly.
  var options = {
    host: 'webcache.googleusercontent.com',
    path: scrapePath,
    port: 80,
    method: 'GET'
  };

  var req = http.request(options, function(res) {
    res.setEncoding('utf8');
    var data = '';
    // Both 'end' and 'close' may fire for the same response; this flag makes
    // sure the result is written (and logged) only once.
    var written = false;

    res.on('data', function(d) {
      data += d;
    });
    var onClose = function() {
      if (written) {
        return;
      }
      written = true;
      console.log("Writing crawl result for " + link);
      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8');
    };
    res.on('close', onClose);
    res.on('end', onClose);
  });

  // Attach the error handler before sending the request so that no failure
  // can be emitted without a listener in place.
  req.on('error', function(e) {
    throw new Error("Error " + e + " scraping " + link);
  });
  req.end();
}
56 | |
// Base url of the documentation site, and the prefix of its "article not
// found" redirect links; the intended article path follows that prefix.
var MDN_PREFIX = 'https://developer.mozilla.org/';
var MDN_NOT_FOUND_PREFIX =
    'https://developer.mozilla.org/Article_not_found?uri=';

// Hardcode the correct matching url for a few types where the search engine
// gets the wrong answer.
var HARDCODED_LINKS = {
  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen',
  'Text': 'https://developer.mozilla.org/en/DOM/Text',
  'Touch': 'https://developer.mozilla.org/en/DOM/Touch',
  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span',
  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre',
  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame',
  'HTMLFrameSetElement':
      'https://developer.mozilla.org/en/HTML/Element/frameset',
  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation',
  'Notification': 'https://developer.mozilla.org/en/DOM/notification',
  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase'
};

// For every DOM type, record the candidate documentation links in cacheData
// and start a scrape of each of them.
for (var i = 0; i < domTypes.length; i++) {
  var type = domTypes[i];

  // Json containing the search results for the current type.
  var searchJson = JSON.parse(
      fs.readFileSync("output/search/" + type + ".json", 'utf8'));
  if (!('items' in searchJson)) {
    console.warn("No search results for " + type);
    continue;
  }
  var items = searchJson['items'];

  var entry = [];
  cacheData[type] = entry;

  // Own-property check so that a type named like an Object.prototype member
  // (e.g. "toString") is never mistaken for a hardcoded entry.
  if (Object.prototype.hasOwnProperty.call(HARDCODED_LINKS, type)) {
    var hardcodedLink = HARDCODED_LINKS[type];
    entry.push({index: 0, link: hardcodedLink, title: type});
    scrape(type + 0, hardcodedLink);
    continue;
  }

  for (var j = 0; j < items.length; j++) {
    var item = items[j];
    // Be optimistic and replace article not found links with links to where the
    // article should be.
    var link = item['link'];
    if (link.indexOf(MDN_NOT_FOUND_PREFIX) === 0) {
      link = MDN_PREFIX + link.substr(MDN_NOT_FOUND_PREFIX.length);
    }

    entry.push({index: j, link: link, title: item['title']});
    scrape(type + j, link);
  }
}

// cacheData is fully populated at this point (only the page downloads started
// by scrape are still pending), so the index can be written right away.
fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8');
OLD | NEW |