├── .gitignore
├── LICENSE
├── README.md
├── findUsersWithFollowers.js
├── index.js
├── indexUserFollowers.js
├── layout.js
├── lib
│   ├── githubClient.js
│   ├── githubRequest.js
│   ├── loadGraph.js
│   └── redisClient.js
├── makeFollowersGraph.js
├── package.json
├── redisNames.js
└── toBinary.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
lib-cov
*.seed
*.log
*.csv
*.dat
*.out
*.pid
*.gz
*.dot

pids
logs
results

npm-debug.log
node_modules

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015-2024 Andrei Kashcha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# allgithub

Crawling github data for https://github.com/anvaka/pm/

# usage

## Prerequisites:

1. Make sure redis is installed and running on the default port
2. [Register a github token](https://help.github.com/articles/creating-an-access-token-for-command-line-use/)
   and set it in the `GH_TOKEN` environment variable.
3. Install the crawler:

```
git clone https://github.com/anvaka/ghcrawl
cd ghcrawl
npm i
```

Now we are ready to index.

## Find all users with more than 2 followers

This will use the search API to go through all users on GitHub who have more
than two followers. At the moment there are
[more than 400k users](https://github.com/search?q=followers%3A%3E2&type=Users&utf8=%E2%9C%93).

Each search request can return up to 100 records per page, which gives us
`400,000 / 100 = 4,000` requests to make. The search API is rate limited at 30
requests per minute, which means the indexing will take `4,000 / 30 ≈ 133`
minutes - more than two hours:

```
node findUsersWithFollowers.js
```

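Before running it, it can help to see what a single page of that search looks like. Below is a
minimal standalone sketch, not part of the crawler: it assumes Node 18+ (for the global `fetch`),
a `GH_TOKEN` environment variable, and passes the token via the `Authorization` header rather than
the `access_token` query parameter the crawler itself uses.

```
// One page of the same search the crawler performs (sketch, not part of the crawler).
var url = 'https://api.github.com/search/users?per_page=100&q=' +
  encodeURIComponent('followers:>2');

fetch(url, {
  headers: {
    'User-Agent': 'anvaka/ghcrawl',
    'Authorization': 'token ' + process.env.GH_TOKEN
  }
}).then(function (res) {
  // The search API allows 30 requests per minute; the remaining budget is in this header.
  console.log('Search requests left this minute: ' + res.headers.get('x-ratelimit-remaining'));
  return res.json();
}).then(function (page) {
  console.log('Users matching the query: ' + page.total_count);
  console.log('First login on this page: ' + page.items[0].login);
});
```
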
## Find all followers

Now that we have all users who have more than two followers, let's index
those followers. The bad news is that we have to make one request per user.
The good news is that the rate limit is 5,000 requests per hour, which gives us
an estimated `400,000 / 5,000 = 80` hours of work - more than three days:

```
node indexUserFollowers.js
```

## Time to get the graph

Now that we have all users indexed, we can construct the graph:

```
node makeFollowersGraph.js > github.dot
```

# Layout

Convert the graph to binary format:

```
node --max-old-space-size=4096 ./toBinary.js
```

Then use [ngraph.native](https://github.com/anvaka/ngraph.native) for faster
graph layout.

# license

MIT

--------------------------------------------------------------------------------
/findUsersWithFollowers.js:
--------------------------------------------------------------------------------
/**
 * Since github has more than 14 million users, it's not feasible to make a
 * request to the /users/[name]/followers api for each user (it would take
 * several months to finish under the 5k requests per hour limit).
 *
 * Instead we are trying to find all users who have at least N followers (N === 3
 * at the moment) using the search API, and store them in the JOINED_AFTER set.
 */

var githubClient = require('./lib/githubClient.js')(process.env.GH_TOKEN);
var redisClient = require('./lib/redisClient.js')();
var config = require('./redisNames.js');

redisClient.get(config.LAST_FOLLOWER_TIME)
  .then(greet)
  .then(indexUsers);

function greet(after) {
  console.log('Welcome to the crawler of users who have followers.');
  if (after) {
    console.log('Attempting to resume from "joined date": ' + after);
  }

  return after;
}

function indexUsers(after) {
  githubClient.getUsersWhoJoinedAfter(after)
    .then(save)
    .then(loadMore)
    .catch(function(e) {
      console.log('Something went bad: ' + e);
      console.log('Quitting...');
      process.exit(-1);
    });
}

function loadMore(ctx) {
  if (ctx.isDone) {
    console.log('All is done.');
    redisClient.close();
    return;
  }

  githubClient.getWhenUserJoined(ctx.lastSavedId)
    .then(saveLastTimeStamp)
    .then(indexUsers);
}

function save(users) {
  redisClient.saveToSet(config.JOINED_AFTER, users);
  var lastSavedUser = users[users.length - 1];
  console.log('last saved user: ' + lastSavedUser);

  return {
    isDone: users.length < 100, // this can only happen if we reached the last page
    lastSavedId: lastSavedUser
  };
}

function saveLastTimeStamp(stamp) {
  redisClient.set(config.LAST_FOLLOWER_TIME, stamp);
  return stamp;
}

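Note that the crawler above does not page through one giant query; it keeps sliding a `created:>`
window forward, because a single GitHub search query only ever exposes its first 1,000 results
(see also the comment in `redisNames.js`). A rough sketch of that invariant, using hypothetical
`searchUsers` and `getJoinDate` helpers in place of the real client:

```
// Sketch of the sliding "joined after" window (hypothetical helpers, not the real client).
function crawlEveryone(searchUsers, getJoinDate, startDate) {
  // searchUsers is assumed to return logins sorted by join date, oldest first,
  // just like getUsersWhoJoinedAfter in lib/githubClient.js.
  return searchUsers('created:>' + startDate + ' followers:>=3').then(function (logins) {
    if (logins.length === 0) return; // the window moved past the newest user - done
    var lastLogin = logins[logins.length - 1];
    return getJoinDate(lastLogin).then(function (joinedAt) {
      // The last user's join date becomes the lower bound of the next window,
      // so no single query ever needs more than its first 1,000 results.
      return crawlEveryone(searchUsers, getJoinDate, joinedAt);
    });
  });
}
```
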
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
/**
 * This module simply walks through the entire `/users` endpoint and stores every
 * user into redis. The key is the user name, the value is a map of user properties.
 *
 * The map by default contains only the user id (a number). Subsequent crawlers will
 * fill in fields of the map (e.g. followers, starred repositories, etc).
 *
 * @see https://developer.github.com/v3/users/#get-all-users
 */

var githubClient = require('./lib/githubClient.js')(process.env.GH_TOKEN);
var redisClient = require('./lib/redisClient.js')();
var config = require('./redisNames.js');

redisClient.get(config.LAST_SAVED_ID)
  .then(greetUser)
  .then(indexUsers);

function greetUser(since) {
  console.log('Welcome to the github users crawler!');
  if (since) {
    since = parseInt(since, 10);
    console.log('Attempting to resume indexing since user id: ' + since);
  }

  return since;
}

function indexUsers(since) {
  githubClient.getUsers(since)
    .then(save)
    .then(loadMore)
    .catch(function(e) {
      console.log('Something went bad: ' + e);
      console.log('Quitting...');
      process.exit(-1);
    });
}

function loadMore(ctx) {
  if (ctx.isDone) {
    console.log('All is done.');
    redisClient.close();
    return;
  }
  indexUsers(ctx.lastSavedId);
}

function save(users) {
  var lastSavedId = redisClient.saveUsers(users);
  redisClient.set(config.LAST_SAVED_ID, lastSavedId);

  console.log('last saved id: ' + lastSavedId);

  return {
    isDone: users.length < 100, // this can only happen if we reached the last page
    lastSavedId: lastSavedId
  };
}

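The `/users` endpoint this module walks is paginated with an id cursor rather than page numbers:
each response holds up to 100 users, and the id of the last user becomes the `since` value of the
next request - exactly the value `save` persists under `LAST_SAVED_ID`. A minimal standalone
sketch of the same loop (assuming Node 18+ with the global `fetch`; the crawler itself goes
through `lib/githubRequest.js` instead):

```
// Walk the /users endpoint with the `since` cursor (sketch).
function listUsers(since) {
  return fetch('https://api.github.com/users?per_page=100&since=' + since, {
    headers: {
      'User-Agent': 'anvaka/ghcrawl',
      'Authorization': 'token ' + process.env.GH_TOKEN
    }
  }).then(function (res) { return res.json(); }).then(function (users) {
    if (users.length === 0) return; // reached the end of the user list
    var lastId = users[users.length - 1].id;
    console.log('Got ' + users.length + ' users, next cursor: ' + lastId);
    return listUsers(lastId); // resume from the last seen id
  });
}

listUsers(0);
```
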
--------------------------------------------------------------------------------
/indexUserFollowers.js:
--------------------------------------------------------------------------------
/**
 * This module uses data created by `findUsersWithFollowers.js` and for each
 * individual user it issues a request to `/users/[name]/followers`, storing
 * the result into redis (under the `followers` field of the user's hash).
 */
var githubClient = require('./lib/githubClient.js')(process.env.GH_TOKEN);
var redisClient = require('./lib/redisClient.js')();
var config = require('./redisNames.js');

redisClient.get(config.BEING_INDEXED_USER_FOLLOWERS)
  .then(greet)
  .then(indexUserFollowers);

function greet(lastUser) {
  console.log('Welcome to the followers crawler.');
  if (lastUser) {
    console.log('Attempting to resume from: ' + lastUser);
    return lastUser;
  } else {
    return startNextUser();
  }
}

function indexUserFollowers(userName) {
  if (userName === undefined) {
    console.log('Are we done? Looks like there are no more users!');
    process.exit(0);
    return;
  }
  console.log('Indexing followers of ' + userName);

  githubClient.getFollowers(userName)
    .then(save)
    .then(loadMore)
    .catch(function(e) {
      console.log('Something went bad: ' + e);
      console.log('Quitting...');
      process.exit(-1);
    });

  function save(followers) {
    console.log('Saving ' + followers.length + ' followers');
    return redisClient.saveToHash(userName, {
      followers: followers
    });
  }
}

function loadMore() {
  startNextUser().then(indexUserFollowers);
}


function startNextUser() {
  return redisClient.popFromSet(config.JOINED_AFTER).then(markUser);
}

function markUser(user) {
  redisClient.set(config.BEING_INDEXED_USER_FOLLOWERS, user);
  return user;
}

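After this step every crawled user is a redis hash whose `followers` field holds the follower
logins; ioredis flattens the array into a comma-separated string, which is why
`makeFollowersGraph.js` later splits it on `','`. A small sketch of reading one user back
(`someUser` is just a placeholder login):

```
var Redis = require('ioredis');
var redis = new Redis();

// `followers` is the comma-separated list written by indexUserFollowers.js above.
redis.hgetall('someUser').then(function (user) {
  var followers = user.followers ? user.followers.split(',') : [];
  console.log('someUser has ' + followers.length + ' crawled followers');
  redis.disconnect();
});
```
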
--------------------------------------------------------------------------------
/layout.js:
--------------------------------------------------------------------------------
var graph = require('./lib/loadGraph.js')();
var layout = require('ngraph.offline.layout')(graph);

console.log('Starting layout. This will take a while...');
layout.run();

console.log('Done. Now export this to binary format:');
console.log('node toBinary.js');

--------------------------------------------------------------------------------
/lib/githubClient.js:
--------------------------------------------------------------------------------
module.exports = githubClient;
var githubRequest = require('./githubRequest.js');

function githubClient(token) {
  var tokenPart = '';
  if (token) tokenPart = 'access_token=' + token + '&';
  var USERS = 'https://api.github.com/users?per_page=100&' + tokenPart + 'since=';
  var USER_DETAILS = 'https://api.github.com/users/';
  var SEARCH_USER_WITH_FOLLOWERS = 'https://api.github.com/search/users?' +
    tokenPart + 'per_page=100&sort=joined&order=asc&q=';

  return {
    getUsers: getUsers,
    getWhenUserJoined: getWhenUserJoined,
    getUsersWhoJoinedAfter: getUsersWhoJoinedAfter,
    getFollowers: getFollowers
  };

  function getFollowers(user) {
    if (typeof user !== 'string') throw new Error('User has to be identified by login');

    var followersArg = createRequestArgs(USER_DETAILS + user + '/followers?per_page=100&' + tokenPart);

    return githubRequest(followersArg, true)
      .then(combineFollowers)
      .catch(handleError);

    function handleError(reason) {
      if (reason.statusCode === 404) {
        console.log('WARNING: User ' + user + ' is not found');
        return [];
      }
      throw reason;
    }
  }

  function combineFollowers(results) {
    var allFollowers = [];
    for (var i = 0; i < results.length; ++i) {
      var items = results[i];
      for (var j = 0; j < items.length; j++) {
        var item = items[j];
        allFollowers.push(item.login);
      }
    }
    return allFollowers;
  }

  function getUsersWhoJoinedAfter(date, minFollowers) {
    if (typeof minFollowers !== 'number') minFollowers = 3;
    if (typeof date !== 'string') date = '2005-01-01';

    var searchArgs = createRequestArgs(SEARCH_USER_WITH_FOLLOWERS +
      'created:>' + date +
      ' followers:>=' + minFollowers);

    return githubRequest(searchArgs, true).then(combineResults);

    function combineResults(results) {
      var allResults = [];
      for (var i = 0; i < results.length; ++i) {
        var items = results[i].items;
        for (var j = 0; j < items.length; j++) {
          var item = items[j];
          allResults.push(item.login);
        }
      }
      return allResults;
    }
  }

  function getWhenUserJoined(userName) {
    console.log('Loading user\'s join date: ' + userName);
    var detailsRequest = createRequestArgs(USER_DETAILS + userName + '?' + tokenPart);
    return githubRequest(detailsRequest).then(getTime);

    function getTime(user) {
      return user.created_at;
    }
  }

  function getUsers(since) {
    if (typeof since !== 'number') {
      console.log('`since` argument is not present. Assuming 0');
      since = 0;
    }

    var usersRequest = createRequestArgs(USERS + since);
    console.log('Loading users since ' + since);

    return githubRequest(usersRequest);
  }
}

function createRequestArgs(uri) {
  return {
    uri: uri,
    resolveWithFullResponse: true,
    headers: {
      'User-Agent': 'anvaka/ghcrawl'
    }
  };
}

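One caveat worth flagging: the client above authenticates by appending `access_token=<token>` to
the query string, which GitHub has since deprecated in favour of the `Authorization` header. If
the client were updated today, the request arguments would carry the token roughly like this
(a sketch, not a change the repository actually makes):

```
// Same shape as createRequestArgs above, but with header-based authentication.
function createAuthenticatedRequestArgs(uri, token) {
  return {
    uri: uri,
    resolveWithFullResponse: true,
    headers: {
      'User-Agent': 'anvaka/ghcrawl',
      'Authorization': 'token ' + token // replaces the access_token query parameter
    }
  };
}
```
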
--------------------------------------------------------------------------------
/lib/githubRequest.js:
--------------------------------------------------------------------------------
module.exports = githubRequest;

var P = require("bluebird");
var request = require('request-promise');
var errors = require('request-promise/errors');

function githubRequest(options, followNext) {
  var allPages;
  var failedCount = 0;

  if (followNext) allPages = [];

  return makeRequest(options);

  function makeRequest(options) {
    return request(options)
      .then(verifyRateLimits)
      .catch(errors.StatusCodeError, handleStatusCode);
  }

  function handleStatusCode(reason) {
    failedCount += 1;
    if (reason.statusCode === 403) {
      failedCount = 0;
      var headers = reason.response.headers;
      return getRateLimitPromiseFromHeaders(headers);
    } else if (reason.statusCode === 404) {
      failedCount = 0;
      throw reason;
    } else if (failedCount < 5) {
      console.log('Got bad code, retry #' + failedCount);
      return makeRequest(options);
    }
    console.log('Bad code', reason);
    throw new Error('Too many status errors, quitting');
  }

  function verifyRateLimits(response) {
    var rateLimitPromise = getRateLimitPromiseFromHeaders(response.headers);
    if (rateLimitPromise) return rateLimitPromise;
    var pageResults = JSON.parse(response.body);
    if (followNext) allPages.push(pageResults);

    var nextLink = followNext && getNextFormLink(response.headers.link);
    if (nextLink) {
      options.uri = nextLink;
      return makeRequest(options);
    }

    return followNext ? allPages : pageResults;
  }

  function getRateLimitPromiseFromHeaders(headers) {
    var rateLimit = parseRateLimit(headers);
    console.log('Rate limit: ' + rateLimit.limit + '/' + rateLimit.remaining);
    if (rateLimit.remaining === 0) {
      var waitTime = rateLimit.reset - new Date();
      if (waitTime < 0) {
        // This happens sometimes. Github caches rate limits?
        // Anyway, wait four seconds - it should clear.
        waitTime = 4000;
      }
      console.log('Rate limit exceeded, waiting before retry: ' + waitTime + 'ms');
      console.log('Current time is ' + (new Date()) + '; Reset: ' + (new Date(rateLimit.reset)));
      return P.delay(waitTime).then(resume);
    }
  }

  function resume() {
    return makeRequest(options);
  }
}

function parseRateLimit(headers) {
  var resetUTC = parseInt(headers['x-ratelimit-reset'], 10) * 1000;

  return {
    limit: parseInt(headers['x-ratelimit-limit'], 10),
    remaining: parseInt(headers['x-ratelimit-remaining'], 10),
    reset: resetUTC
  };
}

function getNextFormLink(link) {
  if (typeof link !== 'string') return;
  var linkMatch = link.match(/<(.+)>; rel="next"/);

  return linkMatch && linkMatch[1];
}

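The pagination above rides on GitHub's `Link` response header. For reference, a header like the
one `getNextFormLink` parses looks roughly like this (the URLs are illustrative):

```
// Illustrative Link header; real ones carry actual api.github.com URLs.
var link = '<https://api.github.com/users/someUser/followers?per_page=100&page=2>; rel="next", ' +
  '<https://api.github.com/users/someUser/followers?per_page=100&page=5>; rel="last"';

// The same regex getNextFormLink uses. The greedy capture works because GitHub
// lists rel="next" before rel="last".
var match = link.match(/<(.+)>; rel="next"/);
console.log(match && match[1]);
// -> https://api.github.com/users/someUser/followers?per_page=100&page=2
```
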
--------------------------------------------------------------------------------
/lib/loadGraph.js:
--------------------------------------------------------------------------------
var fs = require('fs');
var fromdot = require('ngraph.fromdot');

module.exports = loadGraph;
function loadGraph() {
  var fileName = process.argv[2] || './github.dot';
  console.log('Loading graph from ' + fileName);
  var content = fs.readFileSync(fileName, 'utf8');
  var graph = fromdot(content);
  console.log('Loaded ' + graph.getLinksCount() + ' edges; ' + graph.getNodesCount() + ' vertices;');
  return graph;
}

--------------------------------------------------------------------------------
/lib/redisClient.js:
--------------------------------------------------------------------------------
// todo: this should probably be split into two files
var Redis = require('ioredis');

module.exports = redisClient;

function redisClient() {
  var redis = new Redis();
  redis.defineCommand('getAllUsers', {
    lua: [
      "local result = {}",
      "for i, v in ipairs(KEYS) do",
      // Keys with '_' prefix are reserved, and do not represent users:
      "  if string.sub(KEYS[i], 1, 1) ~= '_' then",
      "    local user = redis.call('hgetall', KEYS[i])",
      "    table.insert(user, 'login')",
      "    table.insert(user, KEYS[i])",
      "    result[i] = user",
      "  end",
      "end",
      "return result"
    ].join('\n')
  });

  return {
    /**
     * Saves an array of users and returns the maximum seen id
     */
    saveUsers: saveUsers,

    /**
     * Closes the connection and disposes the client
     */
    close: close,

    /**
     * Sets a key/value pair in redis
     *
     * @param {string} key where we store the value
     * @param {string|number} value that we want to store
     */
    set: set,

    /**
     * Gets the value at a given key
     *
     * @param {string} key to the value.
     * @returns promise that resolves with the value.
     */
    get: get,

    /**
     * Adds all values to the set stored at the given key
     *
     * @param {string} key
     * @param {array} values - array of values that become members of the set
     */
    saveToSet: saveToSet,

    /**
     * Removes a random element from the set at a given key
     *
     * @param {string} key
     * @returns promise that resolves with the element.
     */
    popFromSet: popFromSet,

    saveToHash: saveToHash,

    getHash: getHash,

    forEachUser: forEachUser
  };

  function forEachUser(callback, done) {

    getChunk(0);

    function getChunk(from) {
      redis.scan(from).then(processChunk);
    }

    function processChunk(chunk) {
      var cursor = parseInt(chunk[0], 10);
      var logins = chunk[1];
      // getAllUsers for each user returns an array of attributes, e.g.:
      // [
      //   [ 'id', '762', 'login', 'asanghi' ],
      //   [ 'id', '877', 'login', 'larssg' ], ..
      // ]
      // Here we are mapping it to an array of users:
      // [ { id: 762, login: 'asanghi' }, .. ]
      return redis.getAllUsers(logins.length, logins)
        .then(mapToObjects)
        .then(reportToClient)
        .then(getNextChunk);

      function getNextChunk() {
        if (cursor !== 0) getChunk(cursor);
        else done();
      }
    }

    function reportToClient(users) {
      users.forEach(callback);
    }
  }

  function getHash(key) {
    return redis.hgetall(key);
  }

  function saveToHash(key, properties) {
    return redis.hmset(key, properties);
  }

  function saveToSet(key, values) {
    return redis.sadd(key, values);
  }

  function popFromSet(key) {
    return redis.spop(key);
  }

  function set(key, value) {
    return redis.set(key, value);
  }

  function get(key) {
    return redis.get(key);
  }

  function close() {
    redis.disconnect();
  }

  function saveUsers(users) {
    if (!users || typeof users.length !== 'number') throw new Error('Invalid users object: ' + users);

    var pipeline = redis.pipeline();
    var maxId = 0;
    for (var i = 0; i < users.length; ++i) {
      var user = users[i];
      pipeline.hmset(user.login, {
        id: user.id
      });
      if (user.id > maxId) maxId = user.id;
    }

    pipeline.exec(logIfError);

    return maxId;
  }
}

function logIfError(err, results) {
  if (err) {
    console.log('ERROR: ' + err, results);
    throw (err);
  }
}

function mapToObjects(results) {
  return results.map(toObjects);
}

function toObjects(attributesArray) {
  var object = Object.create(null);
  for (var i = 0; i < attributesArray.length; i += 2) {
    object[attributesArray[i]] = attributesArray[i + 1];
  }
  return object;
}

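`forEachUser` above streams users with redis' SCAN cursor: every call returns a new cursor plus a
batch of keys, and a cursor of `0` means the iteration is complete. A stripped-down sketch of the
same loop, without the Lua batching:

```
var Redis = require('ioredis');
var redis = new Redis();

// Visit every key in the database in batches, the same way forEachUser does.
function scanAll(cursor, onKey, done) {
  redis.scan(cursor).then(function (chunk) {
    var nextCursor = parseInt(chunk[0], 10);
    chunk[1].forEach(onKey);
    if (nextCursor !== 0) scanAll(nextCursor, onKey, done);
    else done();
  });
}

scanAll(0, function (key) {
  if (key[0] !== '_') console.log('user key: ' + key); // '_'-prefixed keys are reserved names
}, function () { redis.disconnect(); });
```
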
--------------------------------------------------------------------------------
/makeFollowersGraph.js:
--------------------------------------------------------------------------------
/**
 * This file traverses the redis database and composes a graph of all followers
 */
var redisClient = require('./lib/redisClient.js')();

console.log('digraph GithubFollowers {');
redisClient.forEachUser(considerAddToGraph, quit);

function considerAddToGraph(user) {
  // we skip users without followers for now
  if (user.followers) {
    addUser(user);
  }
}

function quit() {
  console.log('}');
  redisClient.close();
}

function addUser(user) {
  var followers = user.followers.split(',');
  followers.forEach(addLink);

  function addLink(follower) {
    console.log('"' + follower + '"->"' + user.login + '"');
  }
}

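The script writes plain DOT to stdout, one edge per follower relationship, so the resulting
`github.dot` looks roughly like this (the logins are made up):

```
digraph GithubFollowers {
"alice"->"bob"
"carol"->"bob"
"bob"->"carol"
}
```
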
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "ghcrawl",
  "version": "1.0.0",
  "description": "Crawling github data",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [
    "github",
    "crawler"
  ],
  "author": "Andrei Kashcha",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/anvaka/ghcrawl"
  },
  "dependencies": {
    "bluebird": "^2.9.25",
    "ioredis": "^1.2.5",
    "ngraph.fromdot": "^0.1.3",
    "ngraph.graph": "0.0.11",
    "ngraph.offline.layout": "^1.0.0",
    "request-promise": "^0.4.2"
  },
  "devDependencies": {
    "ngraph.todot": "^0.1.3"
  }
}

--------------------------------------------------------------------------------
/redisNames.js:
--------------------------------------------------------------------------------
module.exports = {
  // Used by index.js to store the id of the last saved user (the `since` argument)
  LAST_SAVED_ID: '_lastSavedId',

  /**
   * Used by findUsersWithFollowers.js to store the maximum `created` time of the
   * found users. This time is later used as a start time for the next search.
   * Remember, github's search is limited to 1k results per query, thus we have
   * to find a search invariant that allows us to iterate over millions of users.
   * This `created` time represents such an invariant.
   */
  LAST_FOLLOWER_TIME: '_lastFollowerTime2',

  /**
   * Where findUsersWithFollowers stores all users with followers. It has to be
   * a set, since we don't want duplicates from overlapping searches.
   */
  JOINED_AFTER: '_joinedAfterV2',

  /**
   * The followers crawler pops a user from the JOINED_AFTER set and temporarily
   * stores them here, so that if the program is interrupted we can resume
   * without losing the current user.
   */
  BEING_INDEXED_USER_FOLLOWERS: '_userBeingIndexedForFollowers'
};

--------------------------------------------------------------------------------
/toBinary.js:
--------------------------------------------------------------------------------
var graph = require('./lib/loadGraph.js')();
console.log('Done, loaded ' + graph.getLinksCount() + ' edges; ' + graph.getNodesCount() + ' nodes');
var save = require('ngraph.tobinary');
save(graph, { outDir: './data' });

--------------------------------------------------------------------------------
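
Once `github.dot` exists, the same graph loader can be reused for quick sanity checks before the
heavy layout step. For example, a rough sketch that prints the most-followed logins in the crawled
data (in the DOT output an edge points from follower to followee, so a node's in-degree is its
follower count):

```
var graph = require('./lib/loadGraph.js')();

// Count incoming edges per node, i.e. how many crawled followers each login has.
var counts = [];
graph.forEachNode(function (node) {
  var inDegree = 0;
  graph.forEachLinkedNode(node.id, function (otherNode, link) {
    if (link.toId === node.id) inDegree += 1;
  });
  counts.push({ login: node.id, followers: inDegree });
});

counts.sort(function (a, b) { return b.followers - a.followers; });
console.log('Most followed logins in the crawled data:');
counts.slice(0, 10).forEach(function (entry) {
  console.log(entry.login + ': ' + entry.followers);
});
```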