/*
 * Repository layout:
 *
 * ├── ex.js
 * ├── package.json
 * ├── test
 * │   └── basic.js
 * ├── README.md
 * └── index.js
 */

// --------------------------------------------------------------------------------
// /ex.js:
// --------------------------------------------------------------------------------

// Example script: crawls a single repository and prints its dependency graph.
var crawl = require('./index')

var opts = {
  repo: 'ipfs/js-ipfs',
  // auth: {
  //   client_id: '',
  //   client_secret: ''
  // }
}

// crawl('jbenet/random-ideas', function (err, graph) {
// crawl('noffle/github-dependency-crawl', function (err, graph) {
crawl(opts, function (err, graph) {
  // Failures belong on stderr so that stdout only ever carries the graph.
  if (err) return console.error(err)

  console.log(graph)
})

/*
 * The callback receives an object whose keys denote issues; each key maps to
 * the list of that issue's dependencies:
 {
   'ipfs/go-ipfs/123': [ 'ipfs/go-ipfs/19', 'ipfs/js-ipfs/27' ],
   ...
 }
*/

// --------------------------------------------------------------------------------
// /package.json:
// --------------------------------------------------------------------------------
/*
{
  "name": "github-dependency-crawl",
  "version": "1.4.0",
  "description": "crawl github issues to build a dependency graph",
  "main": "index.js",
  "scripts": {
    "test": "tape test/*"
  },
  "keywords": [
    "crawl",
    "github",
    "dependency",
    "graph",
    "dependencies",
    "tree",
    "issue",
    "issues"
  ],
  "author": "Stephen Whitmore ",
  "license": "ISC",
  "dependencies": {
    "async": "^2.0.1",
    "request": "^2.74.0",
    "url-regexp": "^1.0.2"
  },
  "devDependencies": {
    "tape": "^4.6.0"
  }
}
*/

// --------------------------------------------------------------------------------
// /test/basic.js:
// --------------------------------------------------------------------------------

// Offline unit test: both GitHub lookups are replaced by in-memory stubs, so the
// crawler is exercised without any network access.
var crawl = require('../index')
var test = require('tape')

test('basic', function (t) {
  // Two assertions: the crawl reports no error, and the graph has the expected shape.
  t.plan(2)

  var opts = {
    repo: 'noffle/github-dependency-crawl',

    // Stub for "list every issue of a repo": only the repo under test has issues.
    repoToGitHubIssues: function (ownerRepo, cb) {
      process.nextTick(function () {
        if (ownerRepo === opts.repo) {
          return cb(null, [
            {
              url: 'https://github.com/noffle/github-dependency-crawl/issues/1',
              body: 'Depends on https://github.com/noffle/talks/issues/13'
            }
          ])
        }
        cb(null, [])
      })
    },

    // Stub for "fetch one issue": only the single cross-repo dependency is known.
    issueToGitHubIssue: function (issue, cb) {
      process.nextTick(function () {
        if (issue === 'noffle/talks/13') {
          return cb(null, {
            url: 'https://github.com/noffle/talks/issues/13',
            body: 'hi friends'
          })
        }
        // Any other lookup means the crawler followed a link this fixture does
        // not contain; fail loudly rather than hand back a value that is not an
        // issue object.
        cb(new Error('unexpected issue lookup: ' + issue))
      })
    }
  }

  crawl(opts, function (err, graph) {
    t.error(err, 'crawl completes without error')

    t.deepEqual(graph, {
      'noffle/github-dependency-crawl/1': [ 'noffle/talks/13' ],
      'noffle/talks/13': []
    })
  })
})

// --------------------------------------------------------------------------------
// /README.md:
// --------------------------------------------------------------------------------
/*
# github-dependency-crawl

> Crawl GitHub issues to build a dependency graph.
*/
4 | 5 | 6 | ## Usage 7 | 8 | Let's see what this very repository's dependency tree looks like: 9 | 10 | ```js 11 | var crawl = require('github-dependency-crawl') 12 | 13 | crawl('noffle/github-dependency-crawl', function (err, graph) { 14 | console.log(graph) 15 | }) 16 | ``` 17 | 18 | It'll look something like this: 19 | 20 | ``` 21 | { 22 | 'noffle/github-dependency-crawl/2': [ 'noffle/github-dependency-crawl/3' ], 23 | 'noffle/github-dependency-crawl/1': [ 'noffle/github-dependency-crawl/2', 'noffle/github-dependency-crawl/3' ], 24 | 'noffle/github-dependency-crawl/3': [ 'noffle/ipget/18' ], 25 | 'noffle/ipget/18': [ 'ipfs/ipget/24', 'ipfs/ipget/26', 'ipfs/ipget/20', 'ipfs/ipget/21' ], 26 | 'ipfs/ipget/24': [], 27 | 'ipfs/ipget/26': [], 28 | 'ipfs/ipget/20': [], 29 | 'ipfs/ipget/21': [] 30 | } 31 | ``` 32 | 33 | Each key is an issue in the graph, and it maps to the list of that issue's 34 | dependencies. 35 | 36 | ## API 37 | 38 | ```js 39 | var crawl = require('github-dependency-crawl') 40 | ``` 41 | 42 | ### crawl(opts, cb) 43 | 44 | Asynchronously makes many GitHub API requests to crawl a given repository's 45 | dependency graph. 46 | 47 | To simply get the dependency graph of a repo, `opts` can be a string of the form 48 | `"org/repo"` for a single repo, or `"org"` to crawl all issues of all 49 | repositories in an organization. 50 | 51 | `cb` is of the form `function (err, graph)`. `graph` is an object of the 52 | form 53 | 54 | ```js 55 | { 56 | issueName: [ issueName ], 57 | issueName: [ issueName ], 58 | ... 59 | } 60 | ``` 61 | 62 | where `issueName` is of the form `org/repo/issue-num` (e.g. 63 | `noffle/latest-tweets/1`). 64 | 65 | Each key is an entry in the dependency graph, and the issues it maps to are its 66 | dependencies. 67 | 68 | For more flexible use, `opts` can be an object of the form 69 | 70 | ```js 71 | { 72 | repo: 'org/repo', // or just 'org' 73 | orgToRepos: function (orgName, cb) { ... }, 74 | repoToGitHubIssues: function (repoName, cb) { ... 
}, 75 | issueToGitHubIssues: function (issueName, cb) { ... }, 76 | auth: { 77 | client_id: '...', 78 | client_secret: '...' 79 | } 80 | } 81 | ``` 82 | 83 | `repoName` will be of the form `org/repo` and `issueName` of the form 84 | `org/repo/issue-num`. 85 | 86 | `auth` provides the option to include GitHub API credentials, to be able to make 87 | a higher # requests / hour. 88 | 89 | By default, the crawler will visit all pages of issues per-repo. 90 | 91 | If not supplied, `orgToRepos`, `repoToGitHubIssues` and `issueToGitHubIssues` 92 | will default to the built-in functionality of querying the GitHub API. These 93 | functions are overwritable here so that the module can a) be easily unit tested, 94 | and b) you can crawl your own offline datasets by e.g. substituting github api 95 | requests for local filesystem reads. 96 | 97 | 98 | ## Install 99 | 100 | With [npm](https://npmjs.org/) installed, run 101 | 102 | ``` 103 | $ npm install github-dependency-crawl 104 | ``` 105 | 106 | ## License 107 | 108 | ISC 109 | 110 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var request = require('request') 2 | var urlMatch = require('url-regexp').match 3 | var urlParse = require('url').parse 4 | var asyncReduce = require('async').reduce 5 | 6 | // TODO: consider using a github api module instead of http api directly 7 | 8 | module.exports = function (opts, cb) { 9 | if (typeof opts === 'string') { 10 | opts = { repo: opts } 11 | } 12 | 13 | if (!cb || typeof cb !== 'function') { 14 | throw new Error('no callback given') 15 | } 16 | 17 | if (!opts.repo) { 18 | throw new Error('missing first param "repo"') 19 | } 20 | 21 | // Parse the org/repo input 22 | var orgRepo = parseOrgRepoInput(opts.repo) 23 | if (!orgRepo) { 24 | throw new Error('malformed input; expected :org/:repo or :org') 25 | } 26 | opts.repo = orgRepo 27 | 28 | // Plug-and-play 
transform functions 29 | opts.orgToRepos = opts.orgToRepos || orgToRepos 30 | opts.repoToGitHubIssues = opts.repoToGitHubIssues || orgRepoToGitHubIssues 31 | opts.issueToGitHubIssue = opts.issueToGitHubIssue || issueToGitHubIssue 32 | 33 | // Aaand action! Recurse on the org or repo 34 | var numComponents = opts.repo.split('/').length 35 | if (numComponents === 1) { 36 | recursiveOrgNameToDependencyGraph(opts.repo, cb) 37 | } else if (numComponents === 2) { 38 | recursiveRepoNameToDependencyGraph(opts.repo, {}, cb) 39 | } else { 40 | throw new Error('repo must be "org" or "org/repo"') 41 | } 42 | 43 | 44 | // --------------------------------------------------------------------------------------- 45 | // Various helper functions, included in the same closure to retain the binding to 'opts'. 46 | // --------------------------------------------------------------------------------------- 47 | 48 | function recursiveOrgNameToDependencyGraph (org, cb) { 49 | "Asynchronously gets all issues from all GitHub repos of a GitHub organization and follows all out-of-repo links recursively, returning a full dependency graph for that organization." 50 | 51 | // Get all repos in the org 52 | orgToRepos(org, function (err, repos) { 53 | // console.log('got all repos', repos.length) 54 | 55 | asyncReduce(repos, {}, 56 | function reduce (graph, repo, callback) { 57 | recursiveRepoNameToDependencyGraph(repo, {}, function (err, graph2) { 58 | if (err) return callback(err) 59 | // console.log(' got repo', repo) 60 | callback(null, flatMerge(graph, graph2)) 61 | }) 62 | }, 63 | function done (err, res) { 64 | if (err) return cb(err) 65 | cb(null, res) 66 | }) 67 | }) 68 | } 69 | 70 | function recursiveRepoNameToDependencyGraph (repo, graph, cb) { 71 | "Asynchronously gets all issues from a GitHub repo and follows all out-of-repo links recursively, returning a full dependency graph for that repo." 
72 | 73 | orgRepoToDependencyGraph(repo, function (err, graph2) { 74 | if (err) return cb(err) 75 | 76 | // console.log('repo ->', graph2) 77 | 78 | graph = flatMerge(graph, graph2) 79 | 80 | recursiveResolveGraph(graph, cb) 81 | }) 82 | } 83 | 84 | function recursiveResolveGraph (graph, cb) { 85 | "Asynchronously takes a partially resolved graph and looks up unresolved dependencies against GitHub until all are satisfied." 86 | 87 | var unresolved = getUnresolvedDependencies(graph) 88 | // console.log('unres', unresolved) 89 | 90 | // Base case; all is resolved already 91 | if (!unresolved.length) { 92 | return cb(null, graph) 93 | } 94 | 95 | // TODO: a possible optimization might be to check if there are e.g. > N 96 | // unresolved dependencies for a single :org/:repo tuple, and just do a 97 | // fetch of that repo's full issue set, filtering out what's not needed. 98 | asyncReduce(unresolved, graph, 99 | function reduce (graph, issue, callback) { 100 | // console.log('issue ->', issue) 101 | issueToDependencyGraph(issue, function (err, innerGraph) { 102 | // console.log('flatMerge', graph, innerGraph) 103 | callback(null, flatMerge(graph, innerGraph)) 104 | }) 105 | }, 106 | function done (err, res) { 107 | if (err) return cb(err) 108 | recursiveResolveGraph(res, cb) 109 | }) 110 | } 111 | 112 | function orgRepoToDependencyGraph (orgRepo, cb) { 113 | "Given a GitHub repo of the form ':org/:repo', returns a dependency graph." 114 | 115 | opts.repoToGitHubIssues(orgRepo, function (err, issues) { 116 | if (err) return cb(err) 117 | cb(null, githubIssuesToDependencyGraph(issues)) 118 | }) 119 | } 120 | 121 | 122 | function issueToDependencyGraph (issue, cb) { 123 | "Given an issue of the form ':org/:repo/:issue-num', returns a list of issues and their declared dependencies." 
124 | 125 | opts.issueToGitHubIssue(issue, function (err, res) { 126 | if (err) return cb(err) 127 | 128 | var graph = githubIssuesToDependencyGraph([res]) 129 | 130 | // Deal with the case that we were redirected, lest infinite loops occur. 131 | // e.g. We ask for ipfs/ipget/1 but results refer to noffle/ipget/1 132 | var name = dependencyUrlToCanonicalName(res.url) 133 | if (name !== issue) { 134 | replaceInGraph(graph, name, issue) 135 | } 136 | 137 | cb(null, graph) 138 | }) 139 | } 140 | 141 | function orgRepoToGitHubIssues (orgRepo, cb) { 142 | "Given a string of the form :org/:repo, asynchronously retrives a list of GitHub API issues. Recursively steps through all pages of issues." 143 | 144 | var url = 'https://api.github.com/repos/' 145 | 146 | // Match freeform repo string to a GH url 147 | if (orgRepo.match(/[A-Za-z0-9-]+\/[A-Za-z0-9-]+/)) { 148 | url += orgRepo + '/issues' 149 | } else { 150 | throw new Error('unrecognized repo format. expected: org/repo') 151 | } 152 | 153 | // Get all issues (not just open ones). 154 | url += "?state=all" 155 | 156 | fetchIssuesPage(url, [], cb) 157 | 158 | 159 | function fetchIssuesPage (url, issuesAccum, cb) { 160 | "Recursively fetches subsequent pages of GitHub issues via the GitHub API." 
161 | 162 | var ropts = { 163 | url: url, 164 | headers: { 165 | 'User-Agent': userAgent() 166 | } 167 | } 168 | if (opts.auth && opts.auth.client_id && opts.auth.client_secret) { 169 | ropts.url += '&client_id=' + opts.auth.client_id 170 | ropts.url += '&client_secret=' + opts.auth.client_secret 171 | } 172 | // console.error('request:', ropts.url) 173 | request(ropts, function (err, res, body) { 174 | // Bogus response 175 | if (err || res.statusCode !== 200) { 176 | // console.log(res) 177 | return cb(err || new Error('status code ' + res.statusCode)) 178 | } 179 | 180 | // Parse JSON response 181 | try { 182 | body = JSON.parse(body) 183 | } catch (err) { 184 | return cb(err) 185 | } 186 | 187 | // console.log(' got issues', body.length) 188 | 189 | issuesAccum = issuesAccum.concat(body) 190 | 191 | // Recursive pagination, or terminate 192 | if (res.headers['link']) { 193 | var links = parseLinkHeader(res.headers['link']) 194 | if (links['next']) { 195 | return fetchIssuesPage(links['next'], issuesAccum, cb) 196 | } 197 | } 198 | 199 | // Fall-through base case: no more pages 200 | // console.log('accum', issuesAccum) 201 | cb(null, issuesAccum) 202 | }) 203 | } 204 | } 205 | 206 | function orgToRepos (org, cb) { 207 | "Given a string of the form :org, retrieve a list of GitHub repo names." 208 | 209 | var url = 'https://api.github.com/orgs/' + org + '/repos' 210 | 211 | // Only grab repos the org actually 'owns'. 212 | url += '?type=source' 213 | 214 | fetchReposPage(url, [], cb) 215 | 216 | function fetchReposPage (url, reposAccum, cb) { 217 | "Recursively fetches subsequent pages of GitHub repos via the GitHub API." 
218 | 219 | var ropts = { 220 | url: url, 221 | headers: { 222 | 'User-Agent': userAgent() 223 | } 224 | } 225 | if (opts.auth && opts.auth.client_id && opts.auth.client_secret) { 226 | ropts.url += '&client_id=' + opts.auth.client_id 227 | ropts.url += '&client_secret=' + opts.auth.client_secret 228 | } 229 | // console.error('request:', ropts.url) 230 | request(ropts, function (err, res, body) { 231 | // Bogus response 232 | if (err || res.statusCode !== 200) { 233 | return cb(err || new Error('status code ' + res.statusCode)) 234 | } 235 | 236 | // Parse JSON response 237 | try { 238 | body = JSON.parse(body) 239 | } catch (err) { 240 | return cb(err) 241 | } 242 | 243 | // Map results to canonical :org/:repo names 244 | body = body.map(function (repo) { 245 | return repo.full_name 246 | }) 247 | 248 | reposAccum = reposAccum.concat(body) 249 | 250 | // Recursive pagination, or terminate 251 | if (res.headers['link']) { 252 | var links = parseLinkHeader(res.headers['link']) 253 | if (links['next']) { 254 | return fetchReposPage(links['next'], reposAccum, cb) 255 | } 256 | } 257 | 258 | // Fall-through base case: no more pages 259 | // console.log('accum', reposAccum) 260 | cb(null, reposAccum) 261 | }) 262 | } 263 | } 264 | 265 | function issueToGitHubIssue (issue, cb) { 266 | "Given a string of the form :org/:repo/:issue, asynchronously retrieves the corresponding GitHub API issue." 
267 | 268 | // Validate the input 269 | var components = issue.split('/') 270 | if (components.length !== 3) { 271 | throw new Error('malformed input; expected :org/:repo/:issue-num') 272 | } 273 | 274 | var org = components[0] 275 | var repo = components[1] 276 | var issueNum = components[2] 277 | 278 | // Retrieve the issue 279 | var ropts = { 280 | url: 'https://api.github.com/repos/' + org + '/' + repo + '/issues/' + issueNum, 281 | headers: { 282 | 'User-Agent': userAgent() 283 | } 284 | } 285 | if (opts.auth && opts.auth.client_id && opts.auth.client_secret) { 286 | ropts.url += '&client_id=' + opts.auth.client_id 287 | ropts.url += '&client_secret=' + opts.auth.client_secret 288 | } 289 | // console.error('request:', opts.url) 290 | request(ropts, function (err, res, body) { 291 | // Bogus response 292 | if (err || res.statusCode !== 200) { 293 | // console.log(res) 294 | return cb(err || new Error('status code ' + res.statusCode)) 295 | } 296 | 297 | // Parse JSON response 298 | try { 299 | body = JSON.parse(body) 300 | } catch (err) { 301 | return cb(err) 302 | } 303 | 304 | cb(null, body) 305 | }) 306 | } 307 | } 308 | 309 | function githubIssuesToDependencyGraph (issues) { 310 | "Given a list of GitHub API issues and returns a dep-graph with all newly discovered dependencies from the issues given." 311 | 312 | // Iterate over each GH API issue, extract its declared dependencies, and 313 | // return an array of objects, each of the form 314 | // { 315 | // 'noffle/ideas/1': [ 'ipfs/go-ipfs/123', 'ipfs/js-ipfs/99' ], 316 | // ... 
317 | // } 318 | issues = filterMap(issues, function (issue) { 319 | var name = dependencyUrlToCanonicalName(issue.url) 320 | var orgRepo = name.split('/').slice(0, 2).join('/') 321 | var deps = filterMap( 322 | extractDependencyUrls(issue.body, orgRepo), 323 | dependencyUrlToCanonicalName) 324 | 325 | var res = {} 326 | res[name] = deps 327 | return res 328 | }) 329 | 330 | // Merge the individual issues together into a single object 331 | return issues 332 | .reduce(function (graph, issue) { 333 | var name = Object.keys(issue)[0] 334 | graph[name] = issue[name] 335 | return graph 336 | }, {}) 337 | } 338 | 339 | function getUnresolvedDependencies (graph) { 340 | "Finds all issues that are referenced by the graph but not contained in it." 341 | 342 | return Object.keys(graph) 343 | .reduce(function (issues, key) { 344 | // all referenced deps that don't exist in the graph 345 | var unresolved = graph[key].filter(function (d) { 346 | return graph[d] === undefined 347 | }) 348 | 349 | return issues.concat(unresolved) 350 | }, []) 351 | } 352 | 353 | function extractDependencyUrls (string, orgRepo) { 354 | "Given a freeform multi-line string, extract all dependencies as URLs. If an optional 'orgRepo' string is given (e.g. noffle/latest-tweets), dependency strings of the form 'Depends on #24' can be resolved to the current repo." 355 | 356 | if (!string) { 357 | return [] 358 | } 359 | 360 | // TODO: assumes \r\n newlines, which is correct *today*, but in THE FUTURE? 
361 | // iterate over lines in the body 362 | return filterMap(string.split('\r\n'), function (line) { 363 | // match 'depends on' prefix 364 | if (line.match(/^Depends on http/)) { 365 | // extract url 366 | var urls = urlMatch(line) 367 | if (urls.length === 1) { 368 | return urls[0] 369 | } 370 | } else if (orgRepo && line.match(/^Depends on #(\d+)/)) { 371 | // extract issue-num 372 | var issueNum = line.match(/^Depends on #(\d+)/)[1] 373 | return 'https://github.com/' + orgRepo + '/issues/' + issueNum 374 | } 375 | return false 376 | }) 377 | } 378 | 379 | function dependencyUrlToCanonicalName (url) { 380 | "Converts a GitHub URL to canonical :org/:repo/:issue-num form, or null if no such form could be extracted." 381 | 382 | // "url": "https://api.github.com/repos/jbenet/random-ideas/issues/37", 383 | 384 | var parsed = urlParse(url) 385 | if (parsed && parsed.protocol && parsed.path) { 386 | var components = parsed.path.split('/') 387 | // https://www.github.com/OWNER/REPO/issues/NUM 388 | if (components.length === 5 && components[0] === '' && components[3] === 'issues') { 389 | return components[1] + '/' + components[2] + '/' + components[4] 390 | } 391 | // https://api.github.com/repos/OWNER/REPO/issues/NUM 392 | else if (components.length === 6 && components[1] === 'repos' && components[4] === 'issues') { 393 | return components[2] + '/' + components[3] + '/' + components[5] 394 | } 395 | } 396 | 397 | return null 398 | } 399 | 400 | function userAgent () { 401 | "Produces a User-Agent string using the package's name and version, of the form NAME/VERSION." 402 | 403 | var package = require(require('path').join(__dirname, 'package.json')) 404 | return package.name + '/' + package.version 405 | } 406 | 407 | function replaceInGraph (graph, from, to) { 408 | "In-place graph mutation, where all instances of 'from' are replaced with 'to'." 
409 | 410 | Object.keys(graph) 411 | .forEach(function (key) { 412 | // replace top-level key 413 | if (key === from) { 414 | graph[to] = graph[from] 415 | delete graph[from] 416 | key = to 417 | } 418 | 419 | // replace occurrences in dependencies 420 | graph[key] = graph[key].map(function (dep) { 421 | return (dep === from) ? to : dep 422 | }) 423 | }) 424 | } 425 | 426 | function parseOrgRepoInput (input) { 427 | "Takes a string and produces a string of the form :org/:repo or :org. If the\ 428 | string is an HTTP(S) GitHub URL, it will be parsed and reduced the\ 429 | aforementioned form. Returns null if parsing is not successful." 430 | 431 | // Validate the resultant repo structure 432 | var components = input.split('/') 433 | if (components.length === 2 || components.length === 1) { 434 | return input 435 | } 436 | 437 | return null 438 | } 439 | 440 | function parseLinkHeader (header) { 441 | `Given a GitHub 'Link' header string, parses it and returns an object mapping 442 | 'rel' names to URLs. 443 | 444 | '; rel="next", ; rel="last"' 445 | 446 | would map to 447 | 448 | { 449 | "next": "https://api.github.com/repositories/20312497/issues?page=2" 450 | "last": "https://api.github.com/repositories/20312497/issues?page=10" 451 | } 452 | ` 453 | 454 | var res = {} 455 | 456 | var regex = /<(.*?)>; rel="(\w+)"/ 457 | var match 458 | while (match = header.match(regex)) { 459 | var url = match[1] 460 | var name = match[2] 461 | res[name] = url 462 | header = header.substring(match[0].length) 463 | } 464 | 465 | return res 466 | } 467 | 468 | function filterMap (list, func) { 469 | "Runs a mapping function 'func' over a list, filtering out elements that are mapped to a non-truthy value." 470 | 471 | return list.map(function (item) { 472 | return func(item) 473 | }).filter(function (item) { 474 | return item 475 | }) 476 | } 477 | 478 | function flatMerge (a, b) { 479 | "Merge two objects together shallowly. On key conflicts, b wins." 
480 | 481 | return Object.keys(b) 482 | .reduce(function (result, key) { 483 | result[key] = b[key] 484 | return result 485 | }, a) 486 | } 487 | --------------------------------------------------------------------------------