Index: utils/apidoc/mdn/crawl.js |
diff --git a/utils/apidoc/mdn/crawl.js b/utils/apidoc/mdn/crawl.js |
deleted file mode 100644 |
index 4a45fd412983a799a624d2ac543ed652b2d8406f..0000000000000000000000000000000000000000 |
--- a/utils/apidoc/mdn/crawl.js |
+++ /dev/null |
@@ -1,120 +0,0 @@ |
-// TODO(jacobr): convert this file to Dart once Dart supports all of the |
-// nodejs functionality used here. For example, search for all occurrences of |
-// "http." and "fs." |
-var http = require('http'); |
-var fs = require('fs'); |
- |
-// Make sure the directory that receives the crawl results exists. |
-try { |
-  fs.mkdirSync('output/crawl'); |
-} catch (e) { |
-  // It doesn't matter if the directory already exists, but any other failure |
-  // (for example a missing 'output' parent directory) would break every later |
-  // write, so surface it here instead of swallowing it. |
-  if (e.code !== 'EEXIST') { |
-    throw e; |
-  } |
-} |
- |
-// Array of DOM type names to crawl, loaded from data/domTypes.json. |
-var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); |
- |
-// Maps each crawled type name to its array of {index, link, title} records. |
-// It is serialized to output/crawl/cache.json at the end of the script. |
-var cacheData = {}; |
- |
-/** |
- * Downloads Google's cached copy of an MDN page and stores it on disk. |
- * |
- * The response body is written to output/crawl/<filename>.html once the |
- * response ends or its connection closes. The request is asynchronous, so |
- * this function returns before anything has been written. |
- * |
- * @param {string} filename Base name (without extension) of the output file. |
- * @param {string} link Url of the page to fetch; it must start with |
- *     https://developer.mozilla.org/. |
- * @throws {Error} If link does not start with the expected prefix. A failed |
- *     request also throws, but from the request's 'error' handler rather than |
- *     to the caller of scrape. |
- */ |
-function scrape(filename, link) { |
-  console.log(link); |
-  var prefix = 'https://developer.mozilla.org/'; |
-  if (link.indexOf(prefix) != 0) { |
-    throw new Error("Unexpected url: " + link); |
-  } |
-  var scrapePath = "/search?q=cache:" + link; |
-  // We crawl content from googleusercontent.com so we don't have to worry about |
-  // crawler politeness like we would have to if scraping developer.mozilla.org |
-  // directly. |
-  var options = { |
-    host: 'webcache.googleusercontent.com', |
-    path: scrapePath, |
-    port: 80, |
-    method: 'GET' |
-  }; |
- |
-  var req = http.request(options, function(res) { |
-    res.setEncoding('utf8'); |
-    var data = ''; |
-    var written = false; |
- |
-    res.on('data', function(d) { |
-      data += d; |
-    }); |
- |
-    // Both 'end' and 'close' may fire for the same response. Keep listening to |
-    // both so that a connection closed early still saves what was received, |
-    // but only write (and log) the result once. |
-    var onClose = function() { |
-      if (written) { |
-        return; |
-      } |
-      written = true; |
-      console.log("Writing crawl result for " + link); |
-      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8'); |
-    }; |
-    res.on('close', onClose); |
-    res.on('end', onClose); |
-  }); |
- |
-  // Attach the error handler before sending the request. Throw a real Error so |
-  // that the resulting crash report carries a stack trace. |
-  req.on('error', function(e) { |
-    throw new Error("Error " + e + " scraping " + link); |
-  }); |
-  req.end(); |
-} |
- |
-// Hardcode the correct matching url for a few types where the search engine |
-// gets the wrong answer. Keys are type names, values are the urls to crawl. |
-var hardcodedLinks = { |
-  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen', |
-  'Text': 'https://developer.mozilla.org/en/DOM/Text', |
-  'Touch': 'https://developer.mozilla.org/en/DOM/Touch', |
-  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent', |
-  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent', |
-  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent', |
-  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent', |
-  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span', |
-  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre', |
-  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame', |
-  'HTMLFrameSetElement': 'https://developer.mozilla.org/en/HTML/Element/frameset', |
-  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation', |
-  'Notification': 'https://developer.mozilla.org/en/DOM/notification', |
-  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase' |
-}; |
- |
-var prefix = 'https://developer.mozilla.org/'; |
-var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; |
- |
-// For every DOM type, read its saved search results, record the candidate |
-// pages in cacheData and start a scrape of each one. Types whose search |
-// results have no 'items' are skipped and get no cacheData entry. |
-for (var i = 0; i < domTypes.length; i++) { |
-  var type = domTypes[i]; |
- |
-  // Json containing the search results for the current type. |
-  var data = fs.readFileSync("output/search/" + type + ".json", 'utf8'); |
-  var json = JSON.parse(data); |
-  if (!('items' in json)) { |
-    console.warn("No search results for " + type); |
-    continue; |
-  } |
-  var items = json['items']; |
- |
-  var entry = []; |
-  cacheData[type] = entry; |
- |
-  // Use hasOwnProperty so that a type named like an inherited Object property |
-  // (e.g. 'constructor') is not mistaken for a hardcoded entry. |
-  if (Object.prototype.hasOwnProperty.call(hardcodedLinks, type)) { |
-    var hardcodedLink = hardcodedLinks[type]; |
-    entry.push({index: 0, link: hardcodedLink, title: type}); |
-    scrape(type + 0, hardcodedLink); |
-    continue; |
-  } |
- |
-  for (var j = 0; j < items.length; j++) { |
-    var item = items[j]; |
-    // Be optimistic and replace article not found links with links to where the |
-    // article should be. |
-    var link = item['link']; |
-    if (link.indexOf(notFoundPrefix) == 0) { |
-      link = prefix + link.substr(notFoundPrefix.length); |
-    } |
- |
-    entry.push({index: j, link: link, title: item['title']}); |
-    scrape(type + j, link); |
-  } |
-} |
- |
-// cacheData is filled in synchronously by the loop above, so it is complete |
-// here even though the scrape requests themselves are still in flight. |
-fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8'); |