├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── index.js ├── lib ├── addDate.js ├── categories.js ├── dial.js ├── page.js ├── random.js └── revisions.js ├── package.json └── test ├── output └── .gitignore └── test.js /.gitignore: -------------------------------------------------------------------------------- 1 | lib-cov 2 | *.seed 3 | *.log 4 | *.csv 5 | *.dat 6 | *.out 7 | *.pid 8 | *.gz 9 | 10 | pids 11 | logs 12 | results 13 | 14 | npm-debug.log 15 | node_modules 16 | 17 | .DS_Store -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.12" 4 | - "0.11" 5 | - "0.10" 6 | notifications: 7 | email: false -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Chris Wilson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | node-wikipedia 2 | ============== 3 | 4 | Node.js wrapper for the [Wikipedia API](http://en.wikipedia.org/w/api.php) 5 | 6 | [![Build Status](https://travis-ci.org/wilson428/node-wikipedia.png)](https://travis-ci.org/wilson428/node-wikipedia) 7 | 8 | #Installation 9 | 10 | npm install node-wikipedia 11 | 12 | #Demo 13 | 14 | var wikipedia = require("node-wikipedia"); 15 | 16 | wikipedia.page.data("Clifford_Brown", { content: true }, function(response) { 17 | // structured information on the page for Clifford Brown (wikilinks, references, categories, etc.) 18 | }); 19 | 20 | wikipedia.revisions.all("Miles_Davis", { comment: true }, function(response) { 21 | // info on each revision made to Miles Davis' page 22 | }); 23 | 24 | wikipedia.categories.tree( 25 | "Philadelphia_Phillies", 26 | function(tree) { 27 | //nested data on the category page for all Phillies players 28 | } 29 | ); 30 | 31 | #Philosophy 32 | 33 | The [MediaWiki API](http://en.wikipedia.org/w/api.php) is wonderfully permissive and horribly documented. This is a lightweight wrapper. In addition to providing a basic interface for making HTTP requests to the API, it bundles some requests so that one needn't bother with pagination and so forth. 34 | 35 | #Under the Hood 36 | `dial.js` makes API requests, accepting parameters as an object, options as an object, and a callback. 37 | 38 | #License 39 | This script is provided free and open-source under the MIT license. If you use it, you are politely encouraged to link to this repo. -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports.categories = require(__dirname + "/lib/categories"); 2 | module.exports.revisions = require(__dirname + "/lib/revisions"); 3 | module.exports.page = require(__dirname + "/lib/page"); 4 | module.exports.random = require("./lib/random"); -------------------------------------------------------------------------------- /lib/addDate.js: -------------------------------------------------------------------------------- 1 | var d3 = require("d3"), 2 | 3 | // accepts date in Wiki or JS format, delta is # of days 4 | module.exports = function(date, delta) { 5 | delta = delta || 0; 6 | var dateFormat = d3.time.format("%Y-%m-%dT%H:%M:%SZ"); 7 | if (typeof date === "string") { 8 | var date = dateFormat.parse(date); 9 | } 10 | var next = new Date(date.getTime() + 1000 * 3600 * 24 * delta); 11 | return dateFormat(next); 12 | } -------------------------------------------------------------------------------- /lib/categories.js: -------------------------------------------------------------------------------- 1 | var log = require("npmlog"); 2 | var dial = require("./dial"); 3 | 4 | // get all pages and subcategories articles in a category 5 | // does NOT look into subcategories. For that, you need the tree function (below) 6 | // for categories with more than 500 members, callback fires after all are retrieved 7 | // if opts has a "onEach" function, fires on each batch of 500 8 | var getAll = module.exports.all = function(category, opts, callback) { 9 | if (arguments.length < 3) { 10 | callback = opts; 11 | opts = {}; 12 | } 13 | 14 | opts.limit = opts.limit || 500; 15 | opts["continue"] = opts["continue"] || ""; 16 | opts.count = opts.count || 0; 17 | 18 | opts.data = opts.data || []; 19 | opts.each = opts.each || function() {}; 20 | 21 | if (category.indexOf("Category:") == -1) { 22 | category = "Category:" + category 23 | } 24 | 25 | var params = { 26 | format: "json", 27 | action: "query", 28 | list: "categorymembers", 29 | cmtitle: category, 30 | redirects: false, 31 | cmprop: "ids|title|type|timestamp", 32 | cmlimit: opts.limit, 33 | cmcontinue: opts["continue"], 34 | lang: opts.lang || 'en' 35 | }; 36 | 37 | if (opts.since) { 38 | params.cmsort = "timestamp"; 39 | params.cmstart = opts.since; 40 | } 41 | 42 | dial(params, function(batch) { 43 | opts.count += batch.query.categorymembers.length; 44 | opts.data = opts.data.concat(batch.query.categorymembers); 45 | 46 | if (opts.max && opts.data.length >= opts.max) { 47 | opts.data = opts.data.slice(0, opts.max); 48 | opts.each(batch.query.categorymembers, { terminal: true }); 49 | return callback(opts.data); 50 | } 51 | 52 | // if cmstart is specified, API returns a new cmstart val instead of a cmcontinue val 53 | if (batch["query-continue"] && batch["query-continue"].categorymembers.cmcontinue) { 54 | log.info("Moving on to " + batch["query-continue"]["categorymembers"]["cmcontinue"]); 55 | opts["continue"] = batch["query-continue"]["categorymembers"]["cmcontinue"]; 56 | 57 | opts.each(batch.query.categorymembers, { terminal: false }); 58 | getAll(category, opts, callback); 59 | } else if (batch["query-continue"] && batch["query-continue"].categorymembers.cmstart) { 60 | log.info("Moving on to " + batch["query-continue"].categorymembers.cmstart); 61 | opts.since = batch["query-continue"].categorymembers.cmstart; 62 | 63 | opts.each(batch.query.categorymembers, { terminal: false }); 64 | getAll(category, opts, callback); 65 | } else { 66 | opts.each(batch.query.categorymembers, { terminal: true }); 67 | log.info("Got " + opts.count + " entries for category " + category); 68 | return callback(opts.data); 69 | } 70 | }); 71 | } 72 | 73 | var getTree = module.exports.tree = function(category, opts, callback, depth) { 74 | if (arguments.length < 3) { 75 | callback = opts; 76 | opts = {}; 77 | } 78 | 79 | depth = depth || 0; 80 | getAll(category, function(members) { 81 | var count = members.length, 82 | branch = { 83 | name: category.replace("Category:", "").replace(/_/g, " "), 84 | pages: [], 85 | subcategories: [] 86 | }; 87 | 88 | opts.parent = branch; 89 | 90 | members.forEach(function(member) { 91 | if (opts.each) { 92 | opts.each(category, member.title, depth, opts); 93 | } 94 | if (member.ns === 0) { 95 | count -= 1; 96 | branch.pages.push(member.title); 97 | } else if (member.ns === 14) { 98 | if (opts.maxdepth && depth >= opts.maxdepth) { 99 | count -= 1; 100 | } else { 101 | getTree(member.title, opts, function(data) { 102 | count -= 1; 103 | branch.subcategories.push(data); 104 | if (count === 0) { 105 | callback(branch); 106 | } 107 | }, depth + 1); 108 | } 109 | } else { 110 | count -= 1; 111 | } 112 | }); 113 | if (count === 0) { 114 | callback(branch); 115 | } 116 | }); 117 | } 118 | 119 | // get revisions to pages using a category as a generator 120 | var getGenerator = module.exports.generator = function(category, opts, callback) { 121 | if (arguments.length < 3) { 122 | callback = opts; 123 | opts = {}; 124 | } 125 | 126 | opts.limit = opts.limit || 500; 127 | opts["continue"] = opts["continue"] || ""; 128 | 129 | if (category.indexOf("Category:") == -1) { 130 | category = "Category:" + category 131 | } 132 | 133 | //http://en.wikipedia.org/w/api.php?format=json&prop=revisions&action=query&generator=categorymembers&gcmtitle=Category:Living_people&gcmlimit=500 134 | var params = { 135 | format: "json", 136 | action: "query", 137 | prop: "revisions", 138 | generator: "categorymembers", 139 | gcmtitle: category, 140 | gcmlimit: opts.limit, 141 | gcmcontinue: opts["continue"] 142 | }; 143 | 144 | dial(params, function(revs) { 145 | callback(revs.query.pages); 146 | 147 | // if there's another page of results, get that too 148 | if (revs["query-continue"]) { 149 | log.info("Moving on to " + revs["query-continue"]["categorymembers"]["gcmcontinue"]); 150 | opts["continue"] = revs["query-continue"]["categorymembers"]["gcmcontinue"]; 151 | revisionsByCategory(category, opts, callback); 152 | } else { 153 | log.log("Finished getting category revision times for " + category); 154 | if (opts.onComplete) { 155 | opts.onComplete(); 156 | } 157 | } 158 | }); 159 | } -------------------------------------------------------------------------------- /lib/dial.js: -------------------------------------------------------------------------------- 1 | var downcache = require("downcache"), 2 | request = require("request"), 3 | urlparse = require("url"), 4 | log = require("npmlog"); 5 | 6 | // dial the Wikipedia API, fire callback on result 7 | module.exports = function(params, opts, callback) { 8 | params.format = "json"; 9 | //params.lang = params.lang || "en"; 10 | if (arguments.length < 3) { 11 | callback = opts; 12 | opts = {}; 13 | } 14 | var url = "http://en.wikipedia.org/w/api.php" + urlparse.format({ query: params }); 15 | log.verbose(url); 16 | 17 | if (opts.cache) { 18 | if (typeof opts.cache === "string") { 19 | opts.cache = { 20 | dir: opts.cache 21 | }; 22 | } 23 | opts.cache.json = true; 24 | downcache(url, opts.cache, function(err, json) { 25 | callback(json, url); 26 | }); 27 | } else { 28 | request(url, { json: true }, function(err, resp, body) { 29 | if (err) { 30 | log.error(url, err); 31 | return; 32 | } else { 33 | callback(body, url); 34 | } 35 | }); 36 | } 37 | } -------------------------------------------------------------------------------- /lib/page.js: -------------------------------------------------------------------------------- 1 | var dial = require("./dial"), 2 | cheerio = require("cheerio"); 3 | 4 | var getData = module.exports.data = function(page, opts, callback) { 5 | if (arguments.length < 3) { 6 | callback = opts; 7 | opts = {}; 8 | } 9 | 10 | var params = { 11 | action: "parse", 12 | //oldid: revid, 13 | page: page, 14 | prop: "categories|externallinks|links", 15 | lang: opts.lang || 'en' 16 | } 17 | 18 | if (opts.content) { 19 | params.prop += "|text"; 20 | } 21 | 22 | if (opts.wikitext) { 23 | params.prop += "|wikitext"; 24 | } 25 | 26 | if (opts.redirects || typeof opts.redirects === "undefined") { 27 | params.redirects = true; 28 | } 29 | 30 | dial(params, function(d, url) { 31 | // include the original page as second parameter in case redirect changed it 32 | callback(d.parse, page, url); 33 | }); 34 | } 35 | 36 | module.exports.image = function(page, callback) { 37 | getData(page, { content: true }, function(data) { 38 | if (!data) { 39 | return false; 40 | } 41 | var $ = cheerio.load("" + data.text['*'] + ""), 42 | images = $(".infobox img"); 43 | 44 | if (images.length > 0) { 45 | callback(images[0].attribs.src); 46 | } else { 47 | console.log("No image found for " + page); 48 | } 49 | }); 50 | } 51 | 52 | module.exports.description = function(page, callback) { 53 | var pattern = new RegExp("SHORT DESCRIPTION ?= ?(.+)", "i"); 54 | 55 | getData(page, { wikitext: true }, function(data, url) { 56 | if (!data) { 57 | return false; 58 | } 59 | 60 | var description = pattern.exec(data.wikitext['*']); 61 | 62 | if (description && description.length > 1) { 63 | var desc = description[1]; 64 | callback(parseWikiText(description[1])); 65 | } else { 66 | callback(null); 67 | } 68 | 69 | }); 70 | } 71 | 72 | function parseWikiText(s) { 73 | // e.g. {{Persondata|NAME=Gordh, Gordon|ALTERNATIVE NAMES=|SHORT DESCRIPTION=Entomologist|DATE OF BIRTH=1945|PLACE OF BIRTH=[[USA]]|DATE OF DEATH=|PLACE OF DEATH=USA}} 74 | s = s.split(/\|[A-Z ]{5,100}/)[0]; 75 | 76 | var pattern = /\[\[(.*?)\]\]/g, 77 | output = s, 78 | m; 79 | 80 | // e.g. | SHORT DESCRIPTION=[[United States Senate|U.S. Senator]] from [[Massachusetts]], [[John Kerry presidential campaign, 2004|2004 presidential nominee]] for the [[Democratic Party (United States)|Democratic Party]] 81 | while (m = pattern.exec(s)) { 82 | if (m[1].split("|").length > 1) { 83 | var sub = m[1].split("|")[1]; 84 | } else { 85 | var sub = m[1]; 86 | } 87 | output = output.replace(m[0], sub); 88 | } 89 | return output.replace(/\s+/g, " "); 90 | } -------------------------------------------------------------------------------- /lib/random.js: -------------------------------------------------------------------------------- 1 | var dial = require("./dial"); 2 | 3 | var getData = module.exports = function(opts, callback) { 4 | if (arguments.length == 1) { 5 | callback = opts; 6 | opts = {}; 7 | } 8 | 9 | var params = { 10 | action: "query", 11 | list: "random", 12 | rnlimit: opts.n || 10, 13 | rnnamespace: 0, 14 | lang: opts.lang || 'en' 15 | } 16 | 17 | dial(params, function(response, url) { 18 | callback(response, url); 19 | }); 20 | } 21 | 22 | -------------------------------------------------------------------------------- /lib/revisions.js: -------------------------------------------------------------------------------- 1 | var dial = require("./dial"), 2 | log = require("npmlog"), 3 | fs = require("fs"); 4 | 5 | var values = function(map) { 6 | var values = []; 7 | for (var key in map) values.push(map[key]); 8 | return values; 9 | }; 10 | 11 | var getRevisions = function(params, opts, callback) { 12 | dial(params, opts, function(response) { 13 | opts.data = opts.data || []; 14 | 15 | if (typeof response == "string") { 16 | response = JSON.parse(response); 17 | } 18 | 19 | if (!response || !response.query) { 20 | log.error("Couldn't read response:", typeof response, JSON.stringify(params)); 21 | //fs.writeFileSync("/Users/cwilson1130/Desktop/wikipedia/errors/" + params.titles + ".json", JSON.stringify(response, null, 2)); 22 | return false; 23 | } 24 | if (response.query.pages) { 25 | var revisions = values(response.query.pages)[0].revisions; 26 | if (revisions) { 27 | opts.data = opts.data.concat(revisions); 28 | } else { 29 | log.warn("Didn't find anything for " + params.titles); 30 | } 31 | } 32 | if (response['continue']) { 33 | opts["continue"] = response.continue.rvcontinue; 34 | params.rvcontinue = response.continue.rvcontinue; 35 | getRevisions(params, opts, callback); 36 | } else { 37 | callback(opts.data); 38 | } 39 | }); 40 | } 41 | 42 | // retrieves all the revisions for a page, optionally between two dates (opts from/until). 43 | // Does NOT fire callback until all results retrieved 44 | module.exports.all = function(page, opts, callback) { 45 | if (arguments.length < 3) { 46 | callback = opts; 47 | opts = {}; 48 | } 49 | 50 | if (!opts.data) { 51 | opts.data = []; 52 | } 53 | 54 | opts["continue"] = opts["continue"] || null; 55 | 56 | var params = { 57 | format: "json", 58 | action: "query", 59 | prop: "revisions", 60 | rvlimit: 500, 61 | titles: page, 62 | rvprop: "ids|timestamp|user" 63 | }; 64 | 65 | if (opts.content) { 66 | params.rvprop += "|content"; 67 | } 68 | 69 | if (opts.wikitext) { 70 | params.rvprop += "|wikitext"; 71 | } 72 | 73 | if (opts.comment) { 74 | params.rvprop += "|comment"; 75 | } 76 | 77 | if (opts["continue"]) { 78 | params.rvcontinue = opts["continue"]; 79 | } 80 | 81 | if (opts.until) { 82 | params.rvstart = addDate(opts.until, 0); 83 | } 84 | 85 | if (opts.from) { 86 | params.rvend = addDate(opts.from, 0); 87 | } else if (opts.until && opts.span) { 88 | params.rvend = addDate(opts.until, -parseInt(opts.span, 10)); 89 | } 90 | 91 | getRevisions(params, opts, callback); 92 | } 93 | 94 | 95 | // get most recent revision to page or pages 96 | module.exports.one = function(pages, opts, callback) { 97 | if (arguments.length < 3) { 98 | callback = opts; 99 | opts = {}; 100 | } 101 | 102 | if (typeof pages === "string") { 103 | pages = [pages]; 104 | } 105 | 106 | for (var i = 0; i < pages.length; i += 50) { 107 | //http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=API|Main%20Page&rvprop=timestamp|user|comment|content 108 | var params = { 109 | format: "json", 110 | action: "query", 111 | prop: "revisions", 112 | rvparse: 1, 113 | titles: pages.slice(i, i + 50).join("|"), 114 | rvprop: opts.content ? "ids|timestamp|contentmodel=wikitext|content" : "ids|timestamp" 115 | }; 116 | 117 | dial(params, function(resp) { 118 | for (var page in resp.query.pages) { 119 | console.log(resp.query.pages[page]); 120 | callback(resp.query.pages[page]); 121 | } 122 | }); 123 | } 124 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-wikipedia", 3 | "version": "0.0.2", 4 | "author": "Chris Wilson ", 5 | "description": "Simple wrapper for MediaWiki API", 6 | "contributors": [], 7 | "main": "index.js", 8 | "repository": { 9 | "type": "git", 10 | "url": "https://github.com/TimeMagazine/node-wikipedia.git" 11 | }, 12 | "keywords": [ 13 | "wikipedia", 14 | "mediawiki", 15 | "api", 16 | "wrapper" 17 | ], 18 | "scripts": { 19 | "test": "test/test.js" 20 | }, 21 | "dependencies" : { 22 | "request" : ">= 2.3.0", 23 | "downcache" : ">= 0.0.2", 24 | "npmlog": "~0.0.6", 25 | "cheerio": "~0.16.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /test/output/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var wikipedia = require("../index"), 4 | fs = require("fs"); 5 | 6 | wikipedia.page.data("Clifford_Brown", { content: true }, function(response) { 7 | fs.writeFileSync(__dirname + "/output/Clifford_Brown.json", JSON.stringify(response, null, 2)); 8 | }); 9 | 10 | // Non-latin alphabets 11 | wikipedia.page.data("Бакунин_Михаил_Александрович", { content: true, lang: 'ru' }, function(response) { 12 | fs.writeFileSync(__dirname + "/output/Бакунин_Михаил_Александрович.json", JSON.stringify(response, null, 2)); 13 | }); 14 | 15 | wikipedia.revisions.all("Miles_Davis", { comment: true }, function(response) { 16 | fs.writeFileSync(__dirname + "/output/Miles_Davis_revisions.json", JSON.stringify(response, null, 2)); 17 | }); 18 | 19 | wikipedia.revisions.all("Buenaventura_Durruti", { comment: true, lang: 'es' }, function(response) { 20 | fs.writeFileSync(__dirname + "/output/Buenaventura_Durruti.json", JSON.stringify(response, null, 2)); 21 | }); 22 | 23 | wikipedia.categories.tree( 24 | "Philadelphia_Phillies", 25 | function(tree) { 26 | fs.writeFileSync(__dirname + "/output/Philadelphia_Phillies.json", JSON.stringify(tree, null, 2)); 27 | } 28 | ); 29 | 30 | 31 | --------------------------------------------------------------------------------