├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── ci.yml
├── .gitignore
├── README.md
├── config
│   └── repos.js
├── examples
│   ├── .gitignore
│   ├── data
│   │   └── ___next_page.txt
│   ├── get_profile.js
│   ├── index.html
│   ├── list-repos.js
│   └── stars-recursive-scrape-save.js
├── index.js
├── lambda
│   ├── debug.js
│   ├── http_request.js
│   └── s3.js
├── lib
│   ├── feed.js
│   ├── followers.js
│   ├── http_request.js
│   ├── index.js
│   ├── issue.js
│   ├── issues.js
│   ├── issues_search.js
│   ├── labels.js
│   ├── milestones.js
│   ├── next_page.js
│   ├── next_page_beta.js
│   ├── org.js
│   ├── org_repos.js
│   ├── people.js
│   ├── profile.js
│   ├── profile_contribs.js
│   ├── repo.js
│   ├── repos.js
│   ├── repos_user.js
│   ├── scrapers.js
│   ├── starred.js
│   ├── stars_watchers.js
│   ├── switcher.js
│   ├── url_validator.js
│   └── utils.js
├── package-lock.json
├── package.json
└── test
    ├── e2e.test.js
    ├── feed.test.js
    ├── fixtures
    │   ├── dwyl-tudo-issue-51-api-comments.json
    │   ├── dwyl-tudo-issue-51-api.json
    │   ├── dwyl-tudo-issue-51-scrape.json
    │   └── dwyl-tudo-issue-51.html
    ├── followers.test.js
    ├── following.test.js
    ├── http_request.test.js
    ├── issue.test.js
    ├── issues.test.js
    ├── issues_search.test.js
    ├── labels.test.js
    ├── milestones.test.js
    ├── org.test.js
    ├── people.test.js
    ├── profile.test.js
    ├── repo.test.js
    ├── repos.test.js
    ├── starred.test.js
    ├── stars.test.js
    ├── switcher.test.js
    ├── url_validator.test.js
    └── utils.test.js
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: npm
4 |     directory: "/"
5 |     schedule:
6 |       interval: weekly
7 |       time: "17:00"
8 |       timezone: Europe/London
9 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # This workflow will do a clean install of node dependencies, cache/restore them, build the source code and run tests across different versions of node
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions
3 |
4 | name: Node.js CI
5 |
6 | on:
7 |   push:
8 |     branches: [ main ]
9 |   pull_request:
10 |     branches: [ main ]
11 |
12 | jobs:
13 |   build:
14 |
15 |     runs-on: ubuntu-latest
16 |
17 |     strategy:
18 |       matrix:
19 |         node-version: [18.x, 20.x]
20 |         # See supported Node.js release schedule at https://nodejs.org/en/about/releases/
21 |
22 |     steps:
23 |       - uses: actions/checkout@v2
24 |       - name: Use Node.js ${{ matrix.node-version }}
25 |         uses: actions/setup-node@v2
26 |         with:
27 |           node-version: ${{ matrix.node-version }}
28 |           cache: 'npm'
29 |       - run: npm ci
30 |       # - run: npm run build --if-present
31 |       - run: npm test
32 |       - name: Upload coverage to Codecov
33 |         uses: codecov/codecov-action@v4
34 |         with:
35 |           token: ${{ secrets.CODECOV_TOKEN }}
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 |
5 | # Runtime data
6 | pids
7 | *.pid
8 | *.seed
9 |
10 | # Directory for instrumented libs generated by jscoverage/JSCover
11 | lib-cov
12 |
13 | # Coverage directory used by tools like istanbul
14 | coverage
15 |
16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
17 | .grunt
18 |
19 | # Compiled binary addons (http://nodejs.org/api/addons.html)
20 | build/Release
21 |
22 | # Dependency directory
23 | # Commenting this out is preferred by some people, see
24 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git-
25 | node_modules
26 |
27 | # Users Environment Variables
28 | .lock-wscript
29 | .vagrant
30 | crawl.js
31 | .DS_Store
32 |
33 | .env
34 | tmp/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # :octocat: 🕷 🕸 GitHub Scraper
4 |
5 | Learn how to parse the DOM of a web page
6 | by using your favourite coding community as an example.
7 |
8 | [](https://github.com/nelsonic/github-scraper/actions)
9 | [](http://codecov.io/github/nelsonic/github-scraper?branch=master)
10 | [](https://github.com/nelsonic/github-scraper/issues)
11 | [](https://hits.dwyl.com/nelsonic/github-scraper)
12 | [](https://www.npmjs.com/package/github-scraper)
13 |
18 |
19 |
21 |
23 |
24 |
25 |
26 |
27 | # ⚠️ Disclaimer / Warning!
28 |
29 | This repository/project is intended for
30 | ***Educational Purposes*** **ONLY**.
31 | The project and corresponding NPM module should not
32 | be used for any purpose other than *learning*.
33 | Please do not use it for any other reason
34 | than to learn _about_ DOM parsing
35 | and _definitely_ don't _depend_ on it for anything important!
36 |
37 | The nature of DOM parsing is that when the HTML/UI changes,
38 | the parser will inevitably fail ...
39 | GitHub have every right to change/improve their UI as they see fit.
40 | When they do change their UI the scraper will _inevitably_ "_break_"!
41 | We have [GitHub Actions CI](https://github.com/nelsonic/github-scraper/actions)
42 | continuous integration running our tests precisely
43 | to _check_ that the parsers for the various pages are working as expected.
44 | You can run the tests locally too,
45 | see
46 | ["Run The Tests"](https://github.com/nelsonic/github-scraper#3-run-the-tests)
47 | section below.
48 |
49 | ## Why?
50 |
51 | Our _initial reason_ for writing this set of scrapers was to satisfy the _curiosity_ / _question_:
52 | > _How_ can we ***discover*** which are the ***interesting people and projects
53 | on GitHub***
54 | (_without **manually** checking *dozens* of GitHub profiles/repositories each day_) ?
55 |
56 | Our _second reason_ for scraping data from GitHub is so that we can show people a "*summary view*" of all their issues in our [Tudo](https://github.com/dwyl/tudo) project (which helps people track/manage/organise/prioritise their GitHub issues).
57 | See: https://github.com/dwyl/tudo/issues/51
58 |
59 | We needed a _simple_ way of systematically getting data from GitHub (_before people authenticate_) and scraping is the only way we could think of.
60 |
61 | We _tried_ using the [GitHub ***API***](https://developer.github.com/v3/)
62 | to get records from GitHub, but sadly,
63 | it has quite a few limitations (see: "_Issues with GitHub API_" section below) the biggest limitation being the [_rate-limiting_](https://developer.github.com/v3/#rate-limiting) on API requests.
64 |
65 | Thirdly we're building this project to [***scratch our own itch***](https://gettingreal.37signals.com/ch02_Whats_Your_Problem.php)
66 | ... scraping the _pages_ of GitHub has given us a _unique_ insight into the features of the platform which has leveled-up our skills.
67 |
68 | > Don't *you* want to know ***what's "Hot" right now on GitHub***...?
69 |
70 |
71 | ## What (*Problem* are we _trying_ to Solve)?
72 |
73 | Having a way of extracting the *essential* data from GitHub
74 | is a solution to a _surprisingly **wide array of problems**_, here are a few:
75 |
76 | + ***Who*** are the up-and-coming people (_worth following_) on GitHub?
77 | + ***Which*** are the ***interesting projects*** (*and why?!*)
78 | + ***What*** is the average age of an issue for a project?
79 | + Is a project's ***popularity growing*** or *plateaued*?
80 | + Are there (_already_) any ***similar projects*** to what I'm trying to build? (_reduce duplication of effort which is rampant in Open Source!!_)
81 | + How many projects get started but never finished?
82 | + ***Will*** my **Pull Request** *ever* get *merged* or is the module maintainer *too busy* and did I just [***waste 3 hours***](https://twitter.com/nelsonic/status/621984170353524736)?
83 | + _insert **your idea/problem** here_ ...
84 | + **Associative Lists** e.g: People who starred `abc` also liked `xyz`
85 |
86 |
87 | # How?
88 |
89 | This module fetches (_public_) pages from GitHub, "[_scrapes_](https://en.wikipedia.org/wiki/Web_scraping)" the html to extract raw data and returns a JSON Object.
90 |
91 | # Usage
92 |
93 | ## install from NPM
94 |
95 | install from npm and save to your `package.json`:
96 |
97 | ```sh
98 | npm install github-scraper --save
99 | ```
100 |
101 | ## Use it in your script!
102 |
103 | ```js
104 | var gs = require('github-scraper');
105 | var url = '/iteles'; // a random username
106 | gs(url, function(err, data) {
107 |   console.log(data); // or whatever you want to do with the data
108 | })
109 | ```
110 |
111 | ## Example URLs and Output
112 |
113 | ### Profile Page
114 |
115 | User profile has the following format `https://github.com/{username}`
116 | example: [https://github.com/**iteles**](https://github.com/iteles)
117 |
118 | 
119 |
120 |
121 | ```js
122 | var gs = require('github-scraper'); // require the module
123 | var url = 'iteles'; // the username from the example above
124 | gs(url, function(err, data) {
125 |   console.log(data); // or whatever you want to do with the data
126 | })
127 | ```
128 |
129 | Sample output:
130 |
131 | ```json
132 | {
133 |   "type": "profile",
134 |   "url": "/iteles",
135 |   "avatar": "https://avatars1.githubusercontent.com/u/4185328?s=400&v=4",
136 |   "name": "Ines Teles Correia",
137 |   "username": "iteles",
138 |   "bio": "Co-founder @dwyl | Head cheerleader @foundersandcoders",
139 |   "uid": 4185328,
140 |   "worksfor": "@dwyl",
141 |   "location": "London, UK",
142 |   "website": "http://www.twitter.com/iteles",
143 |   "orgs": {
144 |     "bowlingjs": "https://avatars3.githubusercontent.com/u/8825909?s=70&v=4",
145 |     "foundersandcoders": "https://avatars3.githubusercontent.com/u/9970257?s=70&v=4",
146 |     "docdis": "https://avatars0.githubusercontent.com/u/10836426?s=70&v=4",
147 |     "dwyl": "https://avatars2.githubusercontent.com/u/11708465?s=70&v=4",
148 |     "ladiesofcode": "https://avatars0.githubusercontent.com/u/16606192?s=70&v=4",
149 |     "TheScienceMuseum": "https://avatars0.githubusercontent.com/u/16609662?s=70&v=4",
150 |     "SafeLives": "https://avatars2.githubusercontent.com/u/20841400?s=70&v=4"
151 |   },
152 |   "repos": 28,
153 |   "projects": 0,
154 |   "stars": 453,
155 |   "followers": 341,
156 |   "following": 75,
157 |   "pinned": [
158 |     { "url": "/dwyl/start-here" },
159 |     { "url": "/dwyl/learn-tdd" },
160 |     { "url": "/dwyl/learn-elm-architecture-in-javascript" },
161 |     { "url": "/dwyl/tachyons-bootstrap" },
162 |     { "url": "/dwyl/learn-ab-and-multivariate-testing" },
163 |     { "url": "/dwyl/learn-elixir" }
164 |   ],
165 |   "contribs": 878,
166 |   "contrib_matrix": {
167 |     "2018-04-08": { "fill": "#c6e48b", "count": 1, "x": "13", "y": "0" },
168 |     "2018-04-09": { "fill": "#c6e48b", "count": 2, "x": "13", "y": "12" },
169 |     "2018-04-10": { "fill": "#7bc96f", "count": 3, "x": "13", "y": "24" },
170 |     ...etc...
171 |     "2019-04-11": { "fill": "#c6e48b", "count": 1, "x": "-39", "y": "48" },
172 |     "2019-04-12": { "fill": "#7bc96f", "count": 5, "x": "-39", "y": "60" }
173 |   }
174 | }
175 | ```
176 |
177 | ### Followers
178 |
179 | How many people are following a given person on Github.
180 | Url format: `https://github.com/{username}/followers`
181 | example: [https://github.com/iteles/**followers**](https://github.com/iteles/followers)
182 |
183 | ```js
184 | var gs = require('github-scraper'); // require the module
185 | var url = 'iteles/followers'; // the followers page for this user
186 | gs(url, function(err, data) {
187 |   console.log(data); // or whatever you want to do with the data
188 | })
189 | ```
190 |
191 | Sample output:
192 |
193 | ```js
194 | { entries:
195 | [ 'tunnckoCore', 'OguzhanE', 'minaorangina', 'Jasonspd', 'muntasirsyed', 'fmoliveira', 'nofootnotes',
196 | 'SimonLab', 'Danwhy', 'kbocz', 'cusspvz', 'RabeaGleissner', 'beejhuff', 'heron2014', 'joshpitzalis',
197 | 'rub1e', 'nikhilaravi', 'msmichellegar', 'anthonybrown', 'miglen', 'shterev', 'NataliaLKB',
198 | 'ricardofbarros', 'boymanjor', 'asimjaved', 'amilvasishtha', 'Subhan786', 'Neats29', 'lottie-em',
199 | 'rorysedgwick', 'izaakrogan', 'oluoluoxenfree', 'markwilliamfirth', 'bmordan', 'nodeco', 'besarthoxhaj',
200 | 'FilWisher', 'maryams', 'sofer', 'joaquimserafim', 'vs4vijay', 'intool', 'edwardcodes', 'hyprstack',
201 | 'nelsonic' ],
202 | url: 'https://github.com/iteles/followers' }
203 | ok 1 iteles/followers count: 45
204 | ```
205 |
206 | If the person has ***more than 51 followers*** they will have multiple pages of followers.
207 | The data will have a **next_page** key with a value such as:
208 | [/nelsonic/followers?**page=2**](https://github.com/nelsonic/followers?page=2)
209 | If you want to keep fetching these subsequent pages of followers,
210 | simply keep running the scraper:
211 | e.g:
212 |
213 | ```js
214 | var url = 'alanshaw/followers'; // a user with multiple pages of followers
215 | gs(url, function(err, data) {
216 |   console.log(data); // or whatever you want to do with the data
217 |   if(data.next_page) {
218 |     gs(data.next_page, function(err2, data2) {
219 |       console.log(data2); // etc.
220 |     })
221 |   }
222 | })
223 | ```
224 |
225 | ### **Following**
226 | Want to know the list of people this person is `following` that's *easy* too!
227 | The url format is: `https://github.com/{username}/following`
228 | e.g: [https://github.com/iteles/**following**](https://github.com/iteles/following) or
229 | [https://github.com/nelsonic/following?**page=2**](https://github.com/nelsonic/following?page=2)
230 | (_where the person is following more than 51 people_ ...)
231 |
232 | Usage format is *identical* to `followers` (above) so here's an example
233 | of fetching page 3 of the results:
234 |
235 | ```js
236 | var gs = require('github-scraper'); // require the module
237 | var url = 'nelsonic/following?page=3'; // page 3 of the people this user follows
238 | gs(url, function(err, data) {
239 |   console.log(data); // or whatever you want to do with the data
240 | })
241 | ```
242 |
243 | Sample output:
244 |
245 | ```js
246 | {
247 | entries:
248 | [ 'kytwb', 'dexda', 'arrival', 'jinnjuice', 'slattery', 'unixarcade', 'a-c-m', 'krosti',
249 | 'simonmcmanus', 'jupiter', 'capaj', 'cowenld', 'FilWisher', 'tsop14', 'NataliaLKB',
250 | 'izaakrogan', 'lynnaloo', 'nvcexploder', 'cwaring', 'missinglink', 'alanshaw', 'olizilla',
251 | 'tancredi', 'Ericat', 'pgte', 'hyprstack', 'iteles' ],
252 | url: 'https://github.com/nelsonic/following?page=3',
253 | next_page: 'https://github.com/nelsonic/following?page=4'
254 | }
255 | ```
256 |
257 | ### Starred Repositories
258 |
259 | The list of projects a person has *starred* is a fascinating source of insight.
260 | url format: https://github.com/stars/{username}
261 | e.g: [/stars/iteles](https://github.com/stars/iteles)
262 |
263 | ```js
264 | var gs = require('github-scraper'); // require the module
265 | var url = 'stars/iteles'; // starred repos for this user
266 | gs(url, function(err, data) {
267 |   console.log(data); // or whatever you want to do with the data
268 | })
269 | ```
270 |
271 | Sample output:
272 |
273 | ```js
274 | {
275 | entries:
276 | [ '/dwyl/repo-badges', '/nelsonic/learn-testling', '/joshpitzalis/testing', '/gmarena/gmarena.github.io',
277 | '/dwyl/alc', '/nikhilaravi/fac5-frontend', '/foundersandcoders/dossier', '/nelsonic/health', '/dwyl/alvo',
278 | '/marmelab/gremlins.js', '/docdis/learn-saucelabs', '/rogerdudler/git-guide', '/tableflip/guvnor',
279 | '/dwyl/learn-redis', '/foundersandcoders/playbook', '/MIJOTHY/FOR_FLUX_SAKE', '/NataliaLKB/learn-git-basics',
280 | '/nelsonic/liso', '/dwyl/learn-json-web-tokens', '/dwyl/hapi-auth-jwt2', '/dwyl/start-here',
281 | '/arvida/emoji-cheat-sheet.com', '/dwyl/time', '/docdis/learn-react', '/dwyl/esta', '/alanshaw/meteor-foam',
282 | '/alanshaw/stylist', '/meteor-velocity/velocity', '/0nn0/terminal-mac-cheatsheet',
283 | '/bowlingjs/bowlingjs.github.io' ],
284 | url: 'https://github.com/stars/iteles?direction=desc&page=2&sort=created',
285 | next_page: 'https://github.com/stars/iteles?direction=desc&page=3&sort=created'
286 | }
287 | ```
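
If you want _all_ pages of starred repos, keep following `next_page` until it disappears.
Here is a minimal recursive sketch (_the `fetchAllStars` helper name is ours, not part of the module;
see `examples/stars-recursive-scrape-save.js` for a fuller version of the idea_):

```js
var gs = require('github-scraper');

// follow next_page links until there are none left:
function fetchAllStars (url, entries, callback) {
  gs(url, function (err, data) {
    if (err) { return callback(err); }
    entries = entries.concat(data.entries); // accumulate the repo urls
    if (data.next_page) {                   // more pages? recurse.
      return fetchAllStars(data.next_page, entries, callback);
    }
    callback(null, entries);                // done: the full list
  });
}

fetchAllStars('stars/iteles', [], function (err, all) {
  console.log(all.length + ' starred repos');
});
```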
288 |
289 | ### Repositories
290 |
291 | The second tab on the personal profile page is "Repositories";
292 | this is a **list** of the ***personal projects*** the person is working on, e.g: https://github.com/iteles?tab=repositories
293 |
294 |
295 |
296 | We crawl this page and return an array containing the repo properties:
297 |
298 | ```js
299 | var url = 'iteles?tab=repositories';
300 | gs(url, function(err, data) {
301 |   console.log(data); // or whatever you want to do with the data
302 | })
303 | ```
304 |
305 | sample output:
306 |
307 | ```js
308 | {
309 | entries: [
310 | { url: '/iteles/learn-ab-and-multivariate-testing',
311 | name: 'learn-ab-and-multivariate-testing',
312 | lang: '',
313 | desc: 'Tutorial on A/B and multivariate testing',
314 | info: '',
315 | stars: '4',
316 | forks: '0',
317 | updated: '2015-07-08T08:36:37Z' },
318 | { url: '/iteles/learn-tdd',
319 | name: 'learn-tdd',
320 | lang: 'JavaScript',
321 | desc: 'A brief introduction to Test Driven Development (TDD) in JavaScript',
322 | info: 'forked from dwyl/learn-tdd',
323 | stars: '0',
324 | forks: '4',
325 | updated: '2015-06-29T17:24:56Z' },
326 | { url: '/iteles/practical-full-stack-testing',
327 | name: 'practical-full-stack-testing',
328 | lang: 'HTML',
329 | desc: 'A fork of @nelsonic\'s repo to allow for PRs',
330 | info: 'forked from nelsonic/practical-js-tdd',
331 | stars: '0',
332 | forks: '36',
333 | updated: '2015-06-06T14:40:43Z' },
334 | { url: '/iteles/styling-for-accessibility',
335 | name: 'styling-for-accessibility',
336 | lang: '',
337 | desc: 'A collection of \'do\'s and \'don\'t\'s of CSS to ensure accessibility',
338 | info: '',
339 | stars: '0',
340 | forks: '0',
341 | updated: '2015-05-26T11:06:28Z' },
342 | { url: '/iteles/Ultimate-guide-to-successful-meetups',
343 | name: 'Ultimate-guide-to-successful-meetups',
344 | lang: '',
345 | desc: 'The ultimate guide to organizing successful meetups',
346 | info: '',
347 | stars: '3',
348 | forks: '0',
349 | updated: '2015-05-19T09:40:39Z' },
350 | { url: '/iteles/Javascript-the-Good-Parts-notes',
351 | name: 'Javascript-the-Good-Parts-notes',
352 | lang: '',
353 | desc: 'Notes on the seminal "Javascript the Good Parts: byDouglas Crockford',
354 | info: '',
355 | stars: '41',
356 | forks: '12',
357 | updated: '2015-05-17T16:39:35Z' }
358 | ],
359 | url: 'https://github.com/iteles?tab=repositories' }
360 | ```
361 |
362 |
363 | ### Activity feed
364 |
365 | Every person on GitHub has an RSS feed for their recent activity;
366 | this is the 3rd and final tab of the person's profile page.
367 |
368 | It can be viewed online by visiting:
369 | ```sh
370 | https://github.com/{username}?tab=activity
371 | ```
372 | e.g: [/iteles?tab=activity](https://github.com/iteles?tab=activity)
373 |
374 |
375 | #### Parsing the Feed
376 |
377 | The activity feed is published as an [**.atom**](https://en.wikipedia.org/wiki/RSS)
378 | xml string which contains a list of entries.
379 |
380 | We use [**xml2js**](https://www.npmjs.com/package/xml2js)
381 | (which in turn uses the [**sax**](https://www.npmjs.com/package/sax) xml parser) to parse the xml stream. This results in an object similar to the following example:
382 |
383 | ```js
384 | { '$':
385 |     { xmlns: 'http://www.w3.org/2005/Atom',
386 |       'xmlns:media': 'http://search.yahoo.com/mrss/',
387 |       'xml:lang': 'en-US' },
388 |   id: [ 'tag:github.com,2008:/iteles' ],
389 |   link: [ { '$': [Object] }, { '$': [Object] } ],
390 |   title: [ 'iteles’s Activity' ],
391 |   updated: [ '2015-07-22T23:31:25Z' ],
392 |   entry:
393 |    [ { id: [Object],
394 |        published: [Object],
395 |        updated: [Object],
396 |        link: [Object],
397 |        title: [Object],
398 |        author: [Object],
399 |        'media:thumbnail': [Object],
400 |        content: [Object] },
401 |      { id: [Object],
402 |        published: [Object],
403 |        updated: [Object],
404 |        link: [Object],
405 |        title: [Object],
406 |        author: [Object],
407 |        'media:thumbnail': [Object],
408 |        content: [Object] }
409 |    ]
410 | }
411 | ```
412 | Each call to the atom feed returns the latest 30 entries.
413 | We're showing 2 here for illustration (_so you get the idea..._)
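
A minimal sketch of producing that object yourself with **xml2js**
(_assuming the feed is served at `https://github.com/{username}.atom`_):

```js
var https = require('https');
var xml2js = require('xml2js');

https.get('https://github.com/iteles.atom', function (res) {
  var xml = '';
  res.on('data', function (chunk) { xml += chunk; });
  res.on('end', function () {
    xml2js.parseString(xml, function (err, result) {
      if (err) { return console.error(err); }
      var feed = result.feed;         // root element of the atom document
      console.log(feed.title[0]);     // e.g: 'iteles’s Activity'
      console.log(feed.entry.length); // up to 30 entries per call
    });
  });
});
```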
414 |
415 | From this we _extract_ only the relevant info:
416 |
417 | ```sh
418 | '2015-07-22T12:33:14Z alanshaw pushed to master at alanshaw/david-www',
419 | '2015-07-22T12:33:14Z alanshaw created tag v9.4.3 at alanshaw/david-www',
420 | '2015-07-22T09:23:28Z alanshaw closed issue tableflip/i18n-browserify#6',
421 | '2015-07-21T17:08:19Z alanshaw commented on issue alanshaw/david#71',
422 | '2015-07-21T08:24:13Z alanshaw pushed to master at tableflip/score-board',
423 | '2015-07-20T17:49:59Z alanshaw deleted branch refactor-corp-events at tableflip/sow-api-client',
424 | '2015-07-20T17:49:58Z alanshaw pushed to master at tableflip/sow-api-client',
425 | '2015-07-20T17:49:58Z alanshaw merged pull request tableflip/sow-api-client#2',
426 | '2015-07-20T17:49:54Z alanshaw opened pull request tableflip/sow-api-client#2',
427 | '2015-07-18T07:30:36Z alanshaw closed issue alanshaw/md-tokenizer#1',
428 | '2015-07-18T07:30:36Z alanshaw commented on issue alanshaw/md-tokenizer#1',
429 | ```
430 | Instead of _wasting_ (_what would eventually be **Giga**_) ***Bytes*** of space on key:value pairs by storing the entries as JSON, we store the activity feed entries as strings in an array.
431 | Each item in the array can be broken down into:
432 | ```sh
433 | {date-time} {username} {action} {link}
434 | ```
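
Splitting a stored entry back into those four fields is cheap.
A sketch (_`parseEntry` is our name for the helper, not part of the module_):

```js
// split a stored feed entry back into its four fields:
function parseEntry (entry) {
  var parts = entry.split(' ');
  return {
    date: parts[0],                       // {date-time}
    username: parts[1],                   // {username}
    action: parts.slice(2, -1).join(' '), // {action} (variable length)
    link: parts[parts.length - 1]         // {link}
  };
}

parseEntry('2015-07-22T12:33:14Z alanshaw pushed to master at alanshaw/david-www');
// { date: '2015-07-22T12:33:14Z', username: 'alanshaw',
//   action: 'pushed to master at', link: 'alanshaw/david-www' }
```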
435 |
436 | As we can see from this there are several event types:
437 |
438 | + **pushed to master** at
439 | + **created tag** v9.4.3 at
440 | + **opened issue**
441 | + **commented on issue**
442 | + **closed issue**
443 | + **deleted branch**
444 | + **opened pull request**
445 | + **merged pull request**
446 | + **starred** username/repo-name
447 |
448 | For now we are *not* going to parse the event types; we simply store them in our list for later analysis.
449 |
450 | We have a good pointer for when it's time to start interpreting the data:
451 | https://developer.github.com/v3/activity/events/types/
452 |
453 | One thing worth noting is that the RSS feed is ***Not Real-Time*** ...
454 | sadly, it only gets updated periodically so we cannot rely on it to
455 | have the *latest* info.
456 |
457 |
458 | ### Organization
459 |
460 | Organization pages have the following url pattern: `https://github.com/{orgname}`
461 | example: [https://github.com/**dwyl**](https://github.com/dwyl)
462 |
463 | ```js
464 | var url = 'dwyl';
465 | gs(url, function(err, data) {
466 |   console.log(data); // or do something way more interesting with the data!
467 | });
468 | ```
469 |
470 | sample data (`entries` _truncated for brevity_):
471 | ```js
472 | {
473 | entries:
474 | [ { name: 'hapi-auth-jwt2',
475 | desc: 'Secure Hapi.js authentication plugin using JSON Web Tokens (JWT)',
476 | updated: '2015-08-04T19:30:50Z',
477 | lang: 'JavaScript',
478 | stars: '59',
479 | forks: '11' },
480 | { name: 'start-here',
481 | desc: 'A Quick-start Guide for People who want to DWYL',
482 | updated: '2015-08-03T11:04:14Z',
483 | lang: 'HTML',
484 | stars: '14',
485 | forks: '9' },
486 | { name: 'summer-2015',
487 | desc: 'Probably the best Summer Sun, Fun & Coding Experience in the World!',
488 | updated: '2015-07-31T11:02:29Z',
489 | lang: 'CSS',
490 | stars: '16',
491 | forks: '1' },
492 | ],
493 | website: 'http://dwyl.io',
494 | url: 'https://github.com/dwyl',
495 | name: 'dwyl - do what you love',
496 | desc: 'Start here: https://github.com/dwyl/start-here',
497 | location: 'Your Pocket',
498 | email: 'github@dwyl.io',
499 | pcount: 24,
500 | avatar: 'https://avatars3.githubusercontent.com/u/11708465?v=3&s=200',
501 | next_page: '/dwyl?page=2'
502 | }
503 | ```
504 | Note #1: *sadly*, this has a url format ***identical*** to a *Profile*;
505 | this gets handled by the `switcher`, which infers whether a page is an org or a profile
506 | by checking for a known element on the page.
507 |
508 | Note #2: when an organization has *multiple pages* of repositories you will see a `next_page`
509 | key/value in the `data` e.g: [/dwyl?**page=2**](/dwyl?page=2) (for the second page of repos)
510 |
511 |
512 | ### Repository Stats
513 |
514 | This is where things start getting interesting ...
515 |
516 | 
517 |
518 | example: https://github.com/nelsonic/adoro
519 |
520 | ```js
521 | var url = 'nelsonic/adoro';
522 | gs(url, function(err, data) {
523 |   console.log(data); // or do something way more interesting with the data!
524 | });
525 | ```
526 |
527 | sample data:
528 |
529 | ```js
530 | {
531 | url: 'https://github.com/nelsonic/adoro',
532 | desc: 'The little publishing tool you\'ll love using. [work-in-progress]',
533 | website: 'http://www.dwyl.io/',
534 | watchers: 3,
535 | stars: 8,
536 | forks: 1,
537 | commits: 12,
538 | branches: 1,
539 | releases: 1,
540 | langs: [ 'JavaScript 90.7%', 'CSS 9.3%' ]
541 | }
542 | ```
543 |
544 | > Annoyingly, the numbers of issues, pull requests and contributors
545 | are only rendered *after* the page has loaded (via XHR) so we do not get
546 | these three stats on page load.
547 |
548 |
549 | ### Issues
550 |
551 | Clicking on the issues icon/link in any repository takes us to the list of all the issues.
552 |
553 | A project with more than a page's worth of issues has pagination at the bottom of the page:
554 |
555 | 
556 |
557 | Which has a link to: https://github.com/dwyl/tudo/issues?page=2&q=is%3Aissue+is%3Aopen
558 |
559 | 
560 |
561 | List of issues for a repository:
562 |
563 | ```js
564 | var gs = require('github-scraper');
565 | var url = '/dwyl/tudo/issues';
566 | gs(url, function (err, data) {
567 |   console.log(data); // use the data however you like
568 | });
569 | ```
570 |
571 | sample output:
572 |
573 | ```js
574 | { entries:
575 | [
576 | {
577 | url: '/dwyl/tudo/issues/46',
578 | title: 'discuss components',
579 | created: '2015-07-21T15:34:22Z',
580 | author: 'benjaminlees',
581 | comments: 3,
582 | assignee: 'izaakrogan',
583 | milestone: 'I don\'t know what I\'m doing',
584 | labels: [ 'enhancement', 'help wanted', 'question' ]
585 | },
586 | {
587 | url: '/dwyl/tudo/issues/45',
588 | title: 'Create riot components from HTML structure files',
589 | created: '2015-07-21T15:24:58Z',
590 | author: 'msmichellegar',
591 | comments: 2,
592 | assignee: 'msmichellegar',
593 | labels: [ 'question' ]
594 | }
595 | ], // truncated for brevity
596 | open: 30,
597 | closed: 20,
598 | next: '/dwyl/tudo/issues?page=2&q=is%3Aissue+is%3Aopen',
599 | url: '/dwyl/tudo/issues'
600 | }
601 | ```
602 |
603 | Each issue in the list would create an entry in the crawler (worker) queue:
604 |
605 | ```sh
606 | 2015-07-22T12:33:14Z issue /dwyl/tudo/issues/77
607 | ```
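
A sketch of how those entries could be derived from the scraped list
(_the `queue` array is a stand-in; a real crawler might use Redis or similar_):

```js
var gs = require('github-scraper');

var queue = []; // stand-in for a real worker queue
gs('/dwyl/tudo/issues', function (err, data) {
  if (err) { return console.error(err); }
  data.entries.forEach(function (issue) {
    queue.push(issue.created + ' issue ' + issue.url);
  });
  console.log(queue[0]); // '2015-07-21T15:34:22Z issue /dwyl/tudo/issues/46'
});
```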
608 |
609 | > Should we include the "all issues by this author" link?
610 | + **created_by** https://github.com/dwyl/tudo/issues/created_by/iteles
611 | + **assignee** (assigned to): https://github.com/dwyl/tudo/issues?q=assignee%3Aiteles+is%3Aopen
612 |
613 |
614 | ### Issue (_individual_)
615 |
616 | The result of scraping https://github.com/dwyl/tudo/issues/51
617 |
618 | ```js
619 | var gs = require('github-scraper');
620 | var url = '/dwyl/tudo/issues/51';
621 | gs(url, function (err, data) {
622 |   console.log(data); // use the data however you like
623 | });
624 | ```
625 |
626 | sample output:
627 |
628 | ```js
629 | { entries:
630 | [ { id: 'issue-96442793',
631 | author: 'nelsonic',
632 | created: '2015-07-22T00:00:45Z',
633 | body: 'instead of waiting for people to perform the steps to authorise Tudo (to access their GitHub orgs/issues we could request their GitHub username on the login page and initiate the retrieval of their issues while they are authenticating... That way, by the time they get back to Tudo their issues dashboard is already pre-rendered and loaded! This is a wow-factor people won\'t be expecting and thus our app immediately delivers on our first promise!\n\nThoughts?' },
634 | { id: 'issuecomment-123807796',
635 | author: 'iteles',
636 | created: '2015-07-22T17:54:12Z',
637 | body: 'I\'d love to test this out, this will be an amazing selling point if we can get the performance to work like we expect!' },
638 | { id: 'issuecomment-124048121',
639 | author: 'nelsonic',
640 | created: '2015-07-23T10:20:15Z',
641 | body: '@iteles have you watched the Foundation Episode featuring Kevin Systrom (instagram) ?\n\n\nhttps://www.youtube.com/watch?v=nld8B9l1aRE\n\n\nWhat were the USPs that contributed to instagram\'s success (considering how many photo-related-apps were in the app store at the time) ?\n\ncc: @besarthoxhaj' },
642 | { id: 'issuecomment-124075792',
643 | author: 'besarthoxhaj',
644 | created: '2015-07-23T11:59:31Z',
645 | body: '@nelsonic love the idea! Let\'s do it!' } ],
646 | labels: [ 'enhancement', 'help wanted', 'question' ],
647 | participants: [ 'nelsonic', 'iteles', 'besarthoxhaj' ],
648 | url: '/dwyl/tudo/issues/51',
649 | title: 'Pre-fetch people\'s issues while they are authenticating with GitHub',
650 | state: 'Open',
651 | author: 'nelsonic',
652 | created: '2015-07-22T00:00:45Z',
653 | milestone: 'Minimal Usable Product',
654 | assignee: 'besarthoxhaj' }
655 | ```
656 |
657 | By contrast using the GitHub API to fetch this issue
658 | see: https://developer.github.com/v3/issues/#get-a-single-issue
659 |
660 | format:
661 | ```sh
662 | /repos/:owner/:repo/issues/:number
663 | ```
664 |
665 | ```sh
666 | curl https://api.github.com/repos/dwyl/tudo/issues/51
667 | ```
668 |
669 | ### Milestones
670 |
671 | Milestones are used to group issues into logical units.
672 |
673 | 
674 |
675 | ```js
676 | var gs = require('github-scraper');
677 | var url = '/dwyl/tudo/milestones';
678 | gs(url, function (err, data) {
679 |   console.log(data); // use the data however you like
680 | });
681 | ```
682 |
683 | Sample output:
684 |
685 | ```js
686 | { entries:
687 | [ { name: 'Test Milestone - Please Don\'t Close!',
688 | due: 'Past due by 16 days',
689 | updated: 'Last updated 5 days ago',
690 | desc: 'This Milestone in used in our e2e tests to check for an over-due milestone, so please don\'t close it!',
691 | progress: '0%',
692 | open: 1,
693 | closed: 0 },
694 | { name: 'Minimal Usable Product',
695 | due: 'Due by July 5, 2016',
696 | updated: 'Last updated 2 days ago',
697 | desc: 'What is the absolute minimum we can do to deliver value to people using the app?\n(and thus make them want to come back and use it!)',
698 | progress: '0%',
699 | open: 5,
700 | closed: 0 } ],
701 | url: 'https://github.com/dwyl/tudo/milestones',
702 | open: 2,
703 | closed: 1 }
704 | ```
705 |
706 | ### Labels (for a repository)
707 |
708 | All repositories have a set of standard labels (built-in to GitHub)
709 | e.g: https://github.com/dwyl/tudo/labels is (_currently_) only using the "*standard*" labels.
710 |
711 |
712 |
713 | Whereas the RethinkDB (which uses GitHub for all their project tracking) uses _several **custom labels**_:
714 | https://github.com/rethinkdb/rethinkdb/labels
715 |
716 |
717 |
718 | We need to crawl these for each repo.
719 |
720 | ```js
721 | var gs = require('github-scraper');
722 | var url = '/dwyl/time/labels';
723 | gs(url, function (err, data) {
724 |   console.log(data); // use the data however you like
725 | });
726 | ```
727 |
728 | Here's the extraction of the standard labels:
729 | ```js
730 | [
731 | { name: 'bug',
732 | style: 'background-color: #fc2929; color: #fff;',
733 | link: '/dwyl/tudo/labels/bug',
734 | count: 3 },
735 | { name: 'duplicate',
736 | style: 'background-color: #cccccc; color: #333333;',
737 | link: '/dwyl/tudo/labels/duplicate',
738 | count: 0 },
739 | { name: 'enhancement',
740 | style: 'background-color: #84b6eb; color: #1c2733;',
741 | link: '/dwyl/tudo/labels/enhancement',
742 | count: 11 },
743 | { name: 'help wanted',
744 | style: 'background-color: #159818; color: #fff;',
745 | link: '/dwyl/tudo/labels/help%20wanted',
746 | count: 21 },
747 | { name: 'invalid',
748 | style: 'background-color: #e6e6e6; color: #333333;',
749 | link: '/dwyl/tudo/labels/invalid',
750 | count: 1 },
751 | { name: 'question',
752 | style: 'background-color: #cc317c; color: #fff;',
753 | link: '/dwyl/tudo/labels/question',
754 | count: 10 }
755 | ]
756 | ```
757 |
758 | or a repo that has ***custom labels***:
759 |
760 | ```js
761 | { entries:
762 | [ { name: '[alpha]',
763 | style: 'background-color: #79CDCD; color: #1e3333;',
764 | link: '/dwyl/time/labels/%5Balpha%5D',
765 | count: 2 },
766 | { name: 'API',
767 | style: 'background-color: #006b75; color: #fff;',
768 | link: '/dwyl/time/labels/API',
769 | count: 11 },
770 | { name: 'bug',
771 | style: 'background-color: #fc2929; color: #fff;',
772 | link: '/dwyl/time/labels/bug',
773 | count: 5 },
774 | { name: 'chore',
775 | style: 'background-color: #e11d21; color: #fff;',
776 | link: '/dwyl/time/labels/chore',
777 | count: 9 },
778 | { name: 'discuss',
779 | style: 'background-color: #bfe5bf; color: #2a332a;',
780 | link: '/dwyl/time/labels/discuss',
781 | count: 43 },
782 | { name: 'Documentation',
783 | style: 'background-color: #eb6420; color: #fff;',
784 | link: '/dwyl/time/labels/Documentation',
785 | count: 2 },
786 | { name: 'duplicate',
787 | style: 'background-color: #cccccc; color: #333333;',
788 | link: '/dwyl/time/labels/duplicate',
789 | count: 0 },
790 | { name: 'enhancement',
791 | style: 'background-color: #84b6eb; color: #1c2733;',
792 | link: '/dwyl/time/labels/enhancement',
793 | count: 27 },
794 | { name: 'external dependency',
795 | style: 'background-color: #D1EEEE; color: #2c3333;',
796 | link: '/dwyl/time/labels/external%20dependency',
797 | count: 1 },
798 | { name: 'FrontEnd',
799 | style: 'background-color: #f7c6c7; color: #332829;',
800 | link: '/dwyl/time/labels/FrontEnd',
801 | count: 26 },
802 | { name: 'help wanted',
803 | style: 'background-color: #009800; color: #fff;',
804 | link: '/dwyl/time/labels/help%20wanted',
805 | count: 42 },
806 | { name: 'invalid',
807 | style: 'background-color: #e6e6e6; color: #333333;',
808 | link: '/dwyl/time/labels/invalid',
809 | count: 0 },
810 | { name: 'investigate',
811 | style: 'background-color: #fbca04; color: #332900;',
812 | link: '/dwyl/time/labels/investigate',
813 | count: 18 },
814 | { name: 'MVP',
815 | style: 'background-color: #207de5; color: #fff;',
816 | link: '/dwyl/time/labels/MVP',
817 | count: 27 },
818 | { name: 'NiceToHave',
819 | style: 'background-color: #fbca04; color: #332900;',
820 | link: '/dwyl/time/labels/NiceToHave',
821 | count: 7 },
822 | { name: 'Post MVP',
823 | style: 'background-color: #fef2c0; color: #333026;',
824 | link: '/dwyl/time/labels/Post%20MVP',
825 | count: 24 },
826 | { name: 'question',
827 | style: 'background-color: #cc317c; color: #fff;',
828 | link: '/dwyl/time/labels/question',
829 | count: 25 },
830 | { name: 'UI',
831 | style: 'background-color: #bfdadc; color: #2c3233;',
832 | link: '/dwyl/time/labels/UI',
833 | count: 13 } ],
834 | url: 'https://github.com/dwyl/time/labels' }
835 | ```
836 |
837 | ### Issues > *Search* (*Bonus Feature*)
838 |
839 | A ***much*** more *effective* way of collating all the issues relevant to a person is to search for them!
840 |
841 | example:
842 | https://github.com/search?type=Issues&q=author%3Aiteles&state=open&o=desc&s=created
843 |
844 | ```js
845 | {
846 | entries:
847 | [
848 | { title: 'Remove flexbox from CSS',
849 | url: '/dwyl/dwyl.github.io/issues/29',
850 | desc: 'To ensure the site works across all devices, particularly Kindle/e-readers.',
851 | author: 'iteles',
852 | created: '2015-07-25T22:57:20Z',
853 | comments: 2 },
854 | { title: 'CSS | Add indentation back into main.css (disappeared from master)',
855 | url: '/dwyl/tudo/issues/77',
856 | desc: 'All indentation has been removed from main.css in the latest commit. \n\nThis needs to be put back in as originally written by @msmichellegar and @iteles.',
857 | author: 'iteles',
858 | created: '2015-07-25T16:27:59Z' },
859 | { title: 'CSS | Investigate styling of issue label colours',
860 | url: '/dwyl/tudo/issues/72',
861 | desc: 'Labels can be given any colour so there is no predictable set that we can code into the CSS file.\n\nWe need to investigate what the best way to ensure we can provide the right colour of background to the ...',
862 | author: 'iteles',
863 | created: '2015-07-23T17:49:02Z',
864 | comments: 4 }
865 | ],
866 | next: '/search?o=desc&p=2&q=author%3Aiteles&s=created&state=open&type=Issues'
867 | }
868 | ```
869 |
870 |
871 | #### Owner
872 |
873 | For the issues created across all their *personal* repositories
874 | use a search query of the form:
875 | ```sh
876 | https://github.com/search?q=user%3A{username|org}
877 | &state={state}
878 | &type=Issues&s={relevance}
879 | &o={order}
880 | ```
881 | e.g:
882 | https://github.com/search?q=user%3Aiteles&state=open&type=Issues&s=updated&o=asc
883 |
884 | #### Author (_created by_)
885 |
886 | Or to find ***all*** the issues where the person is the ***author***
887 | use a query of the following format:
888 |
889 | ```sh
890 | https://github.com/search?q=author%3A{username|org}
891 | &state={state}
892 | &type=Issues&s={relevance}
893 | &o={order}
894 | ```
895 |
896 | #### Assignee (_issues assigned to this person_)
897 |
898 | Or to find ***all*** the issues *assigned* to the person use a query of the following format:
899 |
900 | ```sh
901 | https://github.com/search?q=assignee%3A{username|org}
902 | &state={state}
903 | &type=Issues&s={relevance}
904 | &o={order}
905 | &s={filter}
906 | ```
907 |
908 | #### Mentions
909 |
910 | We can use a ***mentions*** (search) query to discover all the
911 | issues where a given person (_username_) was mentioned:
912 |
913 | ```sh
914 | https://github.com/search?q=mentions%3A{username}&type=Issues&state={state}
915 | ```
916 |
917 | e.g: https://github.com/search?q=mentions%3Aiteles&type=Issues&state=open
918 |
919 | This _could_ be more than the issues in the person's (_own_) repos *or* the repos the person has access to (_via org_). e.g:
920 | if [_Sally_](http://www.imdb.com/title/tt1483013/quotes?item=qt1905812)
921 | asks a clarifying question on a project she has not yet contributed to,
922 | the issue will not appear when we crawl the repos on her profile or orgs she has access to ...
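
All four of these query types share the same shape,
so a small helper can build them
(_a sketch; `issueSearchUrl` is our name, not part of the module_):

```js
// build a GitHub issues-search path from the parts shown above:
function issueSearchUrl (qualifier, username, state, sort, order) {
  return '/search?q=' + encodeURIComponent(qualifier + ':' + username) +
    '&type=Issues' +
    (state ? '&state=' + state : '') +
    (sort ? '&s=' + sort : '') +
    (order ? '&o=' + order : '');
}

issueSearchUrl('author', 'iteles', 'open', 'created', 'desc');
// => '/search?q=author%3Aiteles&type=Issues&state=open&s=created&o=desc'
```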
923 |
924 | #### Issues Filters
925 |
926 | There are *many* filters we can use to find issues; here are a few:
927 |
928 | + **created** https://github.com/search?q=author%3Aiteles&s=created&type=Issues&o=desc&state=open
929 | + **updated**: https://github.com/search?q=author%3Aiteles&s=updated&type=Issues&o=desc&state=open
930 | + **date range**: https://github.com/dwyl/time/issues?q=is%3Aissue+is%3Aopen+updated%3A%3C2015-06-28
931 |
932 | ##### Further Reading on Searching+Filters
933 |
934 | For *way* more details on searching & filters see:
935 |
936 | + https://help.github.com/articles/searching-issues/
937 | + https://help.github.com/articles/searching-github/#types-of-searches
938 | + https://help.github.com/articles/search-syntax/
939 |
940 |
941 |
942 |
943 | ## Want More Examples?
944 |
945 | If you want ***even more*** examples of the pages you can scrape,
946 | take a look at our end-to-end tests where we *test* all the scrapers!
947 |
948 |
949 |
950 | ## Future Features / Road Map ?
951 |
952 |
953 | ### Crawl the List of commits
954 |
955 | Would it be interesting to see/track:
956 | + **who** makes the most commits to the project
957 | + **when** (***what time*** of day/night) people do their work
958 | + **what** did the person contribute? (docs, code improvement, tests, typo, dependency update?)
959 |
960 | Show your interest in this feature: https://github.com/nelsonic/github-scraper/issues/17
961 |
962 |
963 |
964 | # Contributing?
965 |
966 | Contributions are _always_ welcome!
967 | We have a backlog of features (_many pages we want to parse_)
968 | please see: https://github.com/nelsonic/github-scraper/issues
969 | If anything interests you, please leave a comment on the issue.
970 |
971 | Your first step to _contributing_ to this project
972 | is to run it on your **`localhost`**.
973 |
974 | ### 1. Clone the Repository
975 |
976 | In your terminal, clone the repository from GitHub:
977 |
978 | ```sh
979 | git clone https://github.com/nelsonic/github-scraper.git && cd github-scraper
980 | ```
981 |
982 | ### 2. Install the Dependencies
983 |
984 | Ensure you have Node.js installed (see: https://nodejs.org).
985 | Then run the following command to install the project dependencies:
986 |
987 | ```sh
988 | npm install
989 | ```
990 |
991 | You should see output in your terminal similar to the following:
992 |
993 | ```
994 | added 162 packages from 177 contributors and audited 265 packages in 4.121s
995 | ```
996 |
997 | That tells you that the dependencies were successfully installed.
998 |
999 |
1000 | ### 3. Run the Tests
1001 |
1002 | In your terminal execute the following command:
1003 |
1004 | ```sh
1005 | npm test
1006 | ```
1007 |
1008 |
1009 | You should see output similar to the following:
1010 |
1011 | ```
1012 | > github-scraper@6.7.1 test /Users/n/code/github-scraper
1013 | > istanbul cover ./node_modules/tape/bin/tape ./test/*.js | node_modules/tap-spec/bin/cmd.js
1014 |
1015 |
1016 | read list of followers for @jupiter (single page of followers)
1017 |
1018 | - - - GitHub Scraper >> /jupiter/followers >> followers - - -
1019 | ✔ jupiter/followers data.type: followers
1020 | ✔ @jupiter/followers has 34 followers
1021 | ✔ Nelson in jupiter/followers
1022 | ✔ @jupiter/followers only has 1 page of followers
1023 |
1024 | read list of followers for @iteles (multi-page)
1025 |
1026 | - - - GitHub Scraper >> /iteles/followers >> followers - - -
1027 | ✔ "followers": 51 on page 1
1028 | ✔ iteles/followers multi-page followers
1029 |
1030 |
1031 | ... etc ...
1032 |
1033 | =============================================================================
1034 | Writing coverage object [/Users/n/code/github-scraper/coverage/coverage.json]
1035 | Writing coverage reports at [/Users/n/code/github-scraper/coverage]
1036 | =============================================================================
1037 | =============================== Coverage summary ===============================
1038 | Statements : 100% ( 192/192 )
1039 | Branches : 100% ( 63/63 )
1040 | Functions : 100% ( 22/22 )
1041 | Lines : 100% ( 192/192 )
1042 | ================================================================================
1043 |
1044 |
1045 | total: 102
1046 | passing: 102
1047 | duration: 31.6s
1048 | ```
1049 |
1050 | The tests take around 30 seconds to run on _my_ `localhost`,
1051 | but your test execution time will vary depending on your location
1052 | (_the further you are from GitHub's servers the slower the tests will run..._).
1053 |
1054 | Don't panic if you see some red in your terminal while the tests are running.
1055 | We have to simulate failure `404` and `403` errors
1056 | to ensure that we can handle them.
1057 | Pages sometimes disappear,
1058 | e.g: a user leaves GitHub or deletes a project,
1059 | and our script needs to not freak out when that happens.
1060 | This is good practice in DOM parsing; the web changes a _lot_!
1061 |
1062 | When the tests _pass_ on your `localhost`,
1063 | you know everything is working as expected.
1064 | Time to move on to the fun bit!
1065 |
1066 | > **Note**: This project follows Test Driven Development (TDD)
1067 | because it's the only way we can maintain our sanity ...
1068 | If we didn't have tests it would be _chaos_
1069 | and _everything_ would "break" all the time.
1070 | If you are contributing to the project,
1071 | please be aware that tests are required
1072 | and any Pull Requests without tests will not be considered.
1073 | (_please don't take it personally, it's just a rule we have_).
1074 |
1075 | If you are new to TDD, please see:
1076 | [github.com/dwyl/**learn-tdd**](https://github.com/dwyl/learn-tdd)
1077 |
1078 |
1079 |
1080 | ### 4. Pick an Issue and Write Some Code!
1081 |
1082 | Once you have the project running on your `localhost`,
1083 | it's time to pick a page to parse!
1084 |
1085 | There are a bunch of features in the backlog. See:
1086 | https://github.com/nelsonic/github-scraper/issues
1087 |
1088 | Pick one that interests you
1089 | and write a comment on it
1090 | to _show_ your interest in contributing.
1091 |
1092 |
1093 | ### Continuous Integration?
1094 |
1095 | We use GitHub Actions for Continuous Integration (CI)
1096 | to ensure that our code works and all tests _pass_
1097 | whenever a change is made to the code (see: `.github/workflows/ci.yml`).
1098 | This is _essential_ in _any_ project and even more so in a DOM parsing one.
1099 |
1100 | If you are new to Continuous Integration, please see:
1101 | [github.com/dwyl/**learn-travis**](https://github.com/dwyl/learn-travis)
1102 |
1103 | ### Pre-Commit Hook?
1104 |
1105 | When you attempt to commit code on your `localhost`,
1106 | the tests will run **`before`** your commit registers.
1107 | This is a precaution to ensure that the code we write is _always tested_.
1108 | There is no point writing code that is not being tested
1109 | as it will "break" almost immediately and be unmaintainable.
1110 |
1111 | Simply wait a few seconds for the tests to pass
1112 | and then push your work to GitHub.
1113 |
1114 | If you are new to pre-commit hooks, please see:
1115 | [github.com/dwyl/**learn-pre-commit**](https://github.com/dwyl/learn-pre-commit)
1116 |
1117 |
1118 |
1119 |
1120 | ## tl;dr
1121 |
1122 | If you are the kind of person that likes to *understand* how something works,
1123 | this is *your* section.
1124 |
1125 | ### Inferring Which Scraper to use from the URL
1126 |
1127 | `lib/switcher.js` handles the inference.
1128 | We wanted to use a `switch > case` construct but ended up using `if/else`
1129 | because there are two types of checks we need to do, so `if/else` seemed simpler.
1130 |
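In outline, the inference looks something like this
(_a simplified sketch, **not** the actual `lib/switcher.js` code_):

```js
// simplified sketch of the two kinds of checks:
function inferScraper (url) {
  // 1. checks we can make on the url alone:
  if (url.indexOf('/followers') !== -1) { return 'followers'; }
  if (url.indexOf('/issues') !== -1) { return 'issues'; }
  if (url.indexOf('?tab=repositories') !== -1) { return 'repos'; }
  // 2. a profile and an org share the same url format,
  // so that check has to happen against the fetched page itself:
  return 'profile-or-org';
}
```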
1131 |
1132 | ## Interesting Facts
1133 |
1134 | + GitHub has 10.3 Million users (_at last count_),
1135 | yet the most followed person [Linus Torvalds](https://github.com/torvalds)
1136 | "_only_" has **28k followers** (_so it's a **highly distributed network**_)
1137 | + https://www.githubarchive.org/ attempts to archive all of GitHub
1138 | + http://octoboard.com/ shows stats for the past 24h
1139 |
1140 |
1141 | ## Research
1142 |
1143 | > Must read up about http://en.wikipedia.org/wiki/Inverted_index
1144 | > so I understand how to use: https://www.npmjs.org/package/level-inverted-index
1145 |
1146 | - GitHub stats (node module): https://github.com/apiengine/ghstats
1147 | (no tests or recent work/activity, but interesting functionality)
1148 |
1149 | - Hard Drive reliability stats:
1150 | https://www.backblaze.com/blog/hard-drive-reliability-update-september-2014
1151 | (useful when selecting which drives to use in the storage array -
1152 | Clear Winner is Hitachi 3TB)
1153 | - RAID explained in layman's terms:
1154 | http://uk.pcmag.com/storage-devices-reviews/7917/feature/raid-levels-explained
1155 | - RAID Calculator:
1156 | https://www.synology.com/en-global/support/RAID_calculator
1157 | (if you don't already know how much space you get)
1158 | - SQLite limits: https://www.sqlite.org/limits.html
1159 |
1160 | ## Useful Links
1161 |
1162 | - Summary of ***Most Active*** GitHub users: http://git.io/top
1163 | - Intro to web-scraping with cheerio:
1164 | https://www.digitalocean.com/community/tutorials/how-to-use-node-js-request-and-cheerio-to-set-up-simple-web-scraping
1165 | - GitHub background info: http://en.wikipedia.org/wiki/GitHub
1166 | + GitHub Event Types:
1167 | https://developer.github.com/v3/activity/events/types/
1168 |
1169 | ### GitHub Stats API
1170 |
1171 | - Github Stats API: https://developer.github.com/v3/repos/statistics/
1172 | - GitHub Followers API: https://developer.github.com/v3/users/followers/
1173 |
1174 | Example:
1175 |
1176 | ```sh
1177 | curl -v https://api.github.com/users/pgte/followers
1178 | ```
1179 |
1180 | ```js
1181 | [
1182 | {
1183 | "login": "methodmissing",
1184 | "id": 379,
1185 | "avatar_url": "https://avatars.githubusercontent.com/u/379?v=2",
1186 | "gravatar_id": "",
1187 | "url": "https://api.github.com/users/methodmissing",
1188 | "html_url": "https://github.com/methodmissing",
1189 | "followers_url": "https://api.github.com/users/methodmissing/followers",
1190 | "following_url": "https://api.github.com/users/methodmissing/following{/other_user}",
1191 | "gists_url": "https://api.github.com/users/methodmissing/gists{/gist_id}",
1192 | "starred_url": "https://api.github.com/users/methodmissing/starred{/owner}{/repo}",
1193 | "subscriptions_url": "https://api.github.com/users/methodmissing/subscriptions",
1194 | "organizations_url": "https://api.github.com/users/methodmissing/orgs",
1195 | "repos_url": "https://api.github.com/users/methodmissing/repos",
1196 | "events_url": "https://api.github.com/users/methodmissing/events{/privacy}",
1197 | "received_events_url": "https://api.github.com/users/methodmissing/received_events",
1198 | "type": "User",
1199 | "site_admin": false
1200 | },
1201 |
1202 | etc...]
1203 | ```
1204 |
1205 | #### Issues with using the GitHub API:
1206 |
1207 | - The API only returns 30 results per query.
1208 | - **X-RateLimit-Limit**: **60** (we can only make 60 requests per hour) ...
1209 | 1440 queries per day (60 per hour x 24 hours) sounds *ample* on the surface.
1210 | But if we assume the average person has at least 2 pages worth of followers (more than 30),
1211 | it means on a single instance/server we can only track 720 people.
1212 | Not really enough to do any sort of trend analysis. :disappointed:
1213 | And if we are tracking people with hundreds of followers (and *growing fast*),
1214 | e.g. 300+ followers, the number of users we can track comes down to
1215 | 1440 / 10 = 144 people
1216 | (10 requests to fetch the complete list of followers); we burn through 1440 requests
1217 | pretty quickly.
1218 | - There's no guarantee which order the followers will be in
1219 | (e.g. most recent first?)
1220 | - **Results** are ***Cached*** so they are not real-time like they are on the
1221 | web. (Seems daft, but it's true.) Ideally they would have a ***Streaming API***,
1222 | but sadly, [GitHub is built in Ruby-on-Rails](http://builtwith.com/github.com)
1223 | which is "**RESTful**" (***not real-time***).
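
The arithmetic behind those numbers, as a quick sketch (_variable names are ours_):

```js
// back-of-envelope: how many people can one unauthenticated
// instance track per day at 60 requests per hour?
var requestsPerDay = 60 * 24;                    // 1440
var pagesPerUser = 2;                            // ~2 pages of followers each
console.log(requestsPerDay / pagesPerUser);      // 720 people
var pagesPerHeavyUser = 10;                      // 300+ followers => ~10 pages
console.log(requestsPerDay / pagesPerHeavyUser); // 144 people
```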
1224 |
1225 | #### *But*...
1226 |
1227 | Once we know _who_ we *should* be following, we can use
1228 |
1229 | - https://developer.github.com/v3/users/followers/#follow-a-user
1230 | - https://developer.github.com/v3/users/followers/#check-if-one-user-follows-another
1231 |
1232 | e.g:
1233 | ```sh
1234 | curl -v https://api.github.com/users/pgte/following/visionmedia
1235 | ```
1236 |
1237 |
1238 |
1239 | # FAQ?
1240 |
1241 | ## Is *Crawling* a Website *Legal*...?
1242 |
1243 | The fact that scraping or "crawling" is Google's Business Model suggests that scraping is at least "OK" ...
1244 |
1245 | I started typing this question into Google and saw:
1246 |
1247 |
1248 | I read a few articles and was not able to locate a definitive answer ...
1249 |
1250 | + Legal Issues: https://en.wikipedia.org/wiki/Web_scraping#Legal_issues
1251 | + It depends: http://resources.distilnetworks.com/h/i/53822104-is-web-scraping-illegal-depends-on-what-the-meaning-of-the-word-is-is/181642
1252 | + Screen scraping: How to profit from your rival's data:
1253 | http://www.bbc.com/news/technology-23988890
1254 | + Web Scraping For Fun and Profit: https://blog.hartleybrody.com/web-scraping/
1255 |
--------------------------------------------------------------------------------
/config/repos.js:
--------------------------------------------------------------------------------
1 |
2 | // CSS selectors used by the scrapers; if GitHub changes its UI, update these first.
3 | const SELECTORS = {
4 |   COMMIT: ".Box-header--blue strong",
5 |   LANGUAGES: ".BorderGrid--spacious .BorderGrid-row",
6 |   FORKED_FROM: 'a[data-hovercard-type="repository"]',
7 |   FOLLOWERS: '.Layout-main .d-table',
8 |   TOPIC_TAG: ".topic-tag",
9 |   PROFILE: 'div[itemtype="http://schema.org/Person"]'
10 | }
11 |
12 | module.exports = SELECTORS;
--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | dwyl
2 | *.json
3 |
--------------------------------------------------------------------------------
/examples/data/___next_page.txt:
--------------------------------------------------------------------------------
1 | /dwyl?page=2
2 | https://github.com/dwyl/aws-lambda-deploy/stargazers?after=Y3Vyc29yOnYyOpO0MjAxNi0wOC0yOVQwNDo0ODozNloAzgP5isg%3D
3 | https://github.com/dwyl/learn-nightwatch/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0wOC0yMFQxNjoyOTowMFoAzgrqe7Q%3D
4 | https://github.com/dwyl/english-words/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0yOFQyMToyOTozOVoAzgymXrc%3D
5 | https://github.com/dwyl/learn-elm/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMS0yOFQxMDozMzozOVoAzgwqTrg%3D
6 | /dwyl/learn-to-send-email-via-google-script-html-no-server/watchers?page=2
7 | https://github.com/dwyl/phoenix-liveview-counter-tutorial/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0wNi0wN1QyMDo1Mjo1MFoAzgpVseM%3D
8 | https://github.com/dwyl/learn-aws-lambda/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMi0wOFQxMTo1MTo0OVoAzgvJ_MM%3D
9 | https://github.com/dwyl/aws-sdk-mock/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0wMlQwOTo1NTozMVoAzgxtXqE%3D
10 | /dwyl/learn-elm/watchers?page=2
11 | https://github.com/dwyl/hapi-auth-jwt2/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMS0yMFQxMjowNDo0NVoAzgulaBM%3D
12 | https://github.com/dwyl/phoenix-ecto-encryption-example/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMC0xNlQxMTo0MDoyNFoAzgtdNa8%3D
13 | https://github.com/dwyl/learn-to-send-email-via-google-script-html-no-server/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0xNVQwMDoxNzo0MVoAzgyIV5U%3D
14 | https://github.com/dwyl/learn-phoenix-framework/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMS0yMlQwMzowMDo0MloAzgupDGk%3D
15 | https://github.com/dwyl/phoenix-chat-example/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMS0wM1QwMzozMTowMloAzgv6dNQ%3D
16 | /dwyl/english-words/watchers?page=2
17 | /dwyl/learn-aws-lambda/watchers?page=2
18 | /dwyl/learn-nightwatch/watchers?page=2
19 |
--------------------------------------------------------------------------------
/examples/get_profile.js:
--------------------------------------------------------------------------------
1 | const fs = require("fs")
2 | const gs = require("../lib/switcher");
3 | const url = "andrew" // "iteles" // a random username
4 | gs(url, function(err, data) {
5 |
6 | fs.writeFileSync(__dirname + "/" + url + ".json", JSON.stringify(data, null, 2))
7 | console.log(data); // or what ever you want to do with the data
8 | })
9 |
--------------------------------------------------------------------------------
/examples/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
89 |
90 |