├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── README.md ├── config └── repos.js ├── examples ├── .gitignore ├── data │ └── ___next_page.txt ├── get_profile.js ├── index.html ├── list-repos.js └── stars-recursive-scrape-save.js ├── index.js ├── lambda ├── debug.js ├── http_request.js └── s3.js ├── lib ├── feed.js ├── followers.js ├── http_request.js ├── index.js ├── issue.js ├── issues.js ├── issues_search.js ├── labels.js ├── milestones.js ├── next_page.js ├── next_page_beta.js ├── org.js ├── org_repos.js ├── people.js ├── profile.js ├── profile_contribs.js ├── repo.js ├── repos.js ├── repos_user.js ├── scrapers.js ├── starred.js ├── stars_watchers.js ├── switcher.js ├── url_validator.js └── utils.js ├── package-lock.json ├── package.json └── test ├── e2e.test.js ├── feed.test.js ├── fixtures ├── dwyl-tudo-issue-51-api-comments.json ├── dwyl-tudo-issue-51-api.json ├── dwyl-tudo-issue-51-scrape.json └── dwyl-tudo-issue-51.html ├── followers.test.js ├── following.test.js ├── http_request.test.js ├── issue.test.js ├── issues.test.js ├── issues_search.test.js ├── labels.test.js ├── milestones.test.js ├── org.test.js ├── people.test.js ├── profile.test.js ├── repo.test.js ├── repos.test.js ├── starred.test.js ├── stars.test.js ├── switcher.test.js ├── url_validator.test.js └── utils.test.js /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: npm 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | time: "17:00" 8 | timezone: Europe/London 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean install of node dependencies, cache/restore them, build the source code and run tests across different versions of node 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [18.x, 20.x] 20 | # See supported Node.js release schedule at https://nodejs.org/en/about/releases/ 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v2 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'npm' 29 | - run: npm ci 30 | # - run: npm run build --if-present 31 | - run: npm test 32 | - name: Upload coverage to Codecov 33 | uses: codecov/codecov-action@v4 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # Compiled binary addons (http://nodejs.org/api/addons.html) 20 | build/Release 21 | 22 | # Dependency directory 23 | # Commenting this out is preferred by some people, see 24 | # 
https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git- 25 | node_modules 26 | 27 | # Users Environment Variables 28 | .lock-wscript 29 | .vagrant 30 | crawl.js 31 | .DS_Store 32 | 33 | .env 34 | tmp/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # :octocat: 🕷 🕸 GitHub Scraper 4 | 5 | Learn how to parse the DOM of a web page 6 | by using your favourite coding community as an example. 7 | 8 | [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/nelsonic/github-scraper/ci.yml?label=build&style=flat-square&branch=main)](https://github.com/nelsonic/github-scraper/actions) 9 | [![codecov.io](https://img.shields.io/codecov/c/github/nelsonic/github-scraper/master.svg?style=flat-square)](http://codecov.io/github/nelsonic/github-scraper?branch=master) 10 | [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat-square)](https://github.com/nelsonic/github-scraper/issues) 11 | [![HitCount](https://hits.dwyl.com/nelsonic/github-scraper.svg)](https://hits.dwyl.com/nelsonic/github-scraper) 12 | [![npm package version](https://img.shields.io/npm/v/github-scraper.svg?color=brightgreen&style=flat-square)](https://www.npmjs.com/package/github-scraper) 13 | 18 | 19 | 21 | Step one: learn JavaScript! 23 | 24 | 25 |
26 | 27 | # ⚠️ Disclaimer / Warning! 28 | 29 | This repository/project is intended for 30 | ***Educational Purposes*** **ONLY**.
31 | The project and corresponding NPM module should not 32 | be used for any purpose other than *learning*. 33 | Please do not use it for any other reason 34 | than to learn _about_ DOM parsing 35 | and _definitely_ don't _depend_ on it for anything important! 36 | 37 | The nature of DOM parsing is that when the HTML/UI changes, 38 | the parser will inevitably fail ... 39 | GitHub have every right to change/improve their UI as they see fit. 40 | When they do change their UI the scraper will _inevitably_ "_break_"! 41 | We have [GitHub Actions](https://github.com/nelsonic/github-scraper/actions) 42 | continuous integration to run our tests precisely 43 | to _check_ that the parsers for the various pages are working as expected. 44 | You can run the tests locally too, 45 | see the 46 | ["Run The Tests"](https://github.com/nelsonic/github-scraper#3-run-the-tests) 47 | section below. 48 | 49 | ## Why? 50 | 51 | Our _initial reason_ for writing this set of scrapers was to satisfy the _curiosity_ / _question_: 52 | > _How_ can we ***discover*** which are the ***interesting people and projects 53 | on GitHub*** 54 | (_without **manually** checking *dozens* of GitHub profiles/repositories each day_)? 55 | 56 | Our _second reason_ for scraping data from GitHub is so that we can show people a "*summary view*" of all their issues in our [Tudo](https://github.com/dwyl/tudo) project (which helps people track/manage/organise/prioritise their GitHub issues). 57 | See: https://github.com/dwyl/tudo/issues/51 58 | 59 | We needed a _simple_ way of systematically getting data from GitHub (_before people authenticate_) and scraping is the only way we could think of. 60 | 61 | We _tried_ using the [GitHub ***API***](https://developer.github.com/v3/) 62 | to get records from GitHub, but sadly, 63 | it has quite a few limitations (see: the "_Issues with GitHub API_" section below), the biggest being the [_rate-limiting_](https://developer.github.com/v3/#rate-limiting) on API requests. 64 | 65 | Thirdly, we're building this project to [***scratch our own itch***](https://gettingreal.37signals.com/ch02_Whats_Your_Problem.php) 66 | ... scraping the _pages_ of GitHub has given us a _unique_ insight into the features of the platform which has leveled-up our skills. 67 | 68 | > Don't *you* want to know ***what's "Hot" right now on GitHub***...? 69 | 70 | 71 | ## What (*Problem* are we _trying_ to Solve)? 72 | 73 | Having a way of extracting the *essential* data from GitHub 74 | is a solution to a _surprisingly **wide array of problems**_, here are a few: 75 | 76 | + ***Who*** are the up-and-coming people (_worth following_) on GitHub? 77 | + ***Which*** are the ***interesting projects*** (*and why?!*) 78 | + ***What*** is the average age of an issue for a project? 79 | + Is a project's ***popularity growing*** or has it *plateaued*? 80 | + Are there (_already_) any ***similar projects*** to what I'm trying to build? (_reduce duplication of effort which is rampant in Open Source!!_) 81 | + How many projects get started but never finished? 82 | + ***Will*** my **Pull Request** *ever* get *merged* or is the module maintainer *too busy* and did I just [***waste 3 hours***](https://twitter.com/nelsonic/status/621984170353524736)? 83 | + _insert **your idea/problem** here_ ... 84 | + **Associative Lists** e.g: People who starred `abc` also liked `xyz` 85 | 86 | 87 | # How?
88 | 89 | This module fetches (_public_) pages from GitHub, "[_scrapes_](https://en.wikipedia.org/wiki/Web_scraping)" the html to extract raw data and returns a JSON Object. 90 | 91 | # Usage 92 | 93 | ## install from NPM 94 | 95 | install from npm and save to your `package.json`: 96 | 97 | ```sh 98 | npm install github-scraper --save 99 | ``` 100 | 101 | ## Use it in your script! 102 | 103 | ```js 104 | var gs = require('github-scraper'); 105 | var url = '/iteles' // a random username 106 | gs(url, function(err, data) { 107 | console.log(data); // or what ever you want to do with the data 108 | }) 109 | ``` 110 | 111 | ## Example URLs and Output 112 | 113 | ### Profile Page 114 | 115 | User profile has the following format `https://github.com/{username}` 116 | example: [https://github.com/**iteles**](https://github.com/iteles) 117 | 118 | ![iteles-github-profile-april-2019-annotated](https://user-images.githubusercontent.com/194400/56076833-3deafd00-5dcd-11e9-87b0-693341a0ff64.png) 119 | 120 | 121 | ```js 122 | var gs = require('github-scraper'); // require the module 123 | var url = 'alanshaw' // a random username (of someone you should follow!) 124 | gs(url, function(err, data) { 125 | console.log(data); // or what ever you want to do with the data 126 | }) 127 | ``` 128 | 129 | Sample output: 130 | 131 | ```json 132 | { 133 | "type": "profile", 134 | "url": "/iteles", 135 | "avatar": "https://avatars1.githubusercontent.com/u/4185328?s=400&v=4", 136 | "name": "Ines Teles Correia", 137 | "username": "iteles", 138 | "bio": "Co-founder @dwyl | Head cheerleader @foundersandcoders", 139 | "uid": 4185328, 140 | "worksfor": "@dwyl", 141 | "location": "London, UK", 142 | "website": "http://www.twitter.com/iteles", 143 | "orgs": { 144 | "bowlingjs": "https://avatars3.githubusercontent.com/u/8825909?s=70&v=4", 145 | "foundersandcoders": "https://avatars3.githubusercontent.com/u/9970257?s=70&v=4", 146 | "docdis": "https://avatars0.githubusercontent.com/u/10836426?s=70&v=4", 147 | "dwyl": "https://avatars2.githubusercontent.com/u/11708465?s=70&v=4", 148 | "ladiesofcode": "https://avatars0.githubusercontent.com/u/16606192?s=70&v=4", 149 | "TheScienceMuseum": "https://avatars0.githubusercontent.com/u/16609662?s=70&v=4", 150 | "SafeLives": "https://avatars2.githubusercontent.com/u/20841400?s=70&v=4" 151 | }, 152 | "repos": 28, 153 | "projects": 0, 154 | "stars": 453, 155 | "followers": 341, 156 | "following": 75, 157 | "pinned": [ 158 | { "url": "/dwyl/start-here" }, 159 | { "url": "/dwyl/learn-tdd" }, 160 | { "url": "/dwyl/learn-elm-architecture-in-javascript" }, 161 | { "url": "/dwyl/tachyons-bootstrap" }, 162 | { "url": "/dwyl/learn-ab-and-multivariate-testing" }, 163 | { "url": "/dwyl/learn-elixir" } 164 | ], 165 | "contribs": 878, 166 | "contrib_matrix": { 167 | "2018-04-08": { "fill": "#c6e48b", "count": 1, "x": "13", "y": "0" }, 168 | "2018-04-09": { "fill": "#c6e48b", "count": 2, "x": "13", "y": "12" }, 169 | "2018-04-10": { "fill": "#7bc96f", "count": 3, "x": "13", "y": "24" }, 170 | ...etc... 171 | "2019-04-11": { "fill": "#c6e48b", "count": 1, "x": "-39", "y": "48" }, 172 | "2019-04-12": { "fill": "#7bc96f", "count": 5, "x": "-39", "y": "60"} 173 | } 174 | } 175 | ``` 176 | 177 | ### Followers 178 | 179 | How many people are following a given person on Github. 
180 | Url format: `https://github.com/{username}/followers` 181 | example: [https://github.com/iteles/**followers**](https://github.com/iteles/followers) 182 | 183 | ```js 184 | var gs = require('github-scraper'); // require the module 185 | var url = 'iteles/followers' // a random username (of someone you should follow!) 186 | gs(url, function(err, data) { 187 | console.log(data); // or what ever you want to do with the data 188 | }) 189 | ``` 190 | 191 | Sample output: 192 | 193 | ```js 194 | { entries: 195 | [ 'tunnckoCore', 'OguzhanE', 'minaorangina', 'Jasonspd', 'muntasirsyed', 'fmoliveira', 'nofootnotes', 196 | 'SimonLab', 'Danwhy', 'kbocz', 'cusspvz', 'RabeaGleissner', 'beejhuff', 'heron2014', 'joshpitzalis', 197 | 'rub1e', 'nikhilaravi', 'msmichellegar', 'anthonybrown', 'miglen', 'shterev', 'NataliaLKB', 198 | 'ricardofbarros', 'boymanjor', 'asimjaved', 'amilvasishtha', 'Subhan786', 'Neats29', 'lottie-em', 199 | 'rorysedgwick', 'izaakrogan', 'oluoluoxenfree', 'markwilliamfirth', 'bmordan', 'nodeco', 'besarthoxhaj', 200 | 'FilWisher', 'maryams', 'sofer', 'joaquimserafim', 'vs4vijay', 'intool', 'edwardcodes', 'hyprstack', 201 | 'nelsonic' ], 202 | url: 'https://github.com/iteles/followers' } 203 | ok 1 iteles/followers count: 45 204 | ``` 205 | 206 | If the person has ***more than 51 followers*** they will have multiple pages of followers. 207 | The data will have a **next_page** key with a value such as: 208 | [/nelsonic/followers?**page=2**](https://github.com/nelsonic/followers?page=2) 209 | If you want to keep fetching these subsequent pages of followers, 210 | simply keep running the scraper: 211 | e.g: 212 | 213 | ```js 214 | var url = 'alanshaw/followers' // a random username (of someone you should follow!) 215 | gs(url, function(err, data) { 216 | console.log(data); // or what ever you want to do with the data 217 | if(data.next_page) { 218 | gs(data.next_page, function(err2, data2) { 219 | console.log(data2); // etc. 220 | }) 221 | } 222 | }) 223 | ``` 224 | 225 | ### **Following** 226 | Want to know the list of people this person is `following` that's *easy* too! 227 | The url format is: `https://github.com/{username}/following` 228 | e.g: [https://github.com/iteles/**following**](https://github.com/iteles/following) or 229 | [https://github.com/nelsonic/following?**page=2**](https://github.com/nelsonic/following?page=2) 230 | (_where the person is following more than 51 people_ ...) 231 | 232 | Usage format is *identical* to `followers` (above) so here's an example 233 | of fetching page 3 of the results: 234 | 235 | ```js 236 | var gs = require('github-scraper'); // require the module 237 | var url = 'nelsonic/following?page=3' // a random dude 238 | gs(url, function(err, data) { 239 | console.log(data); // or what ever you want to do with the data 240 | }) 241 | ``` 242 | 243 | Sample output: 244 | 245 | ```js 246 | { 247 | entries: 248 | [ 'kytwb', 'dexda', 'arrival', 'jinnjuice', 'slattery', 'unixarcade', 'a-c-m', 'krosti', 249 | 'simonmcmanus', 'jupiter', 'capaj', 'cowenld', 'FilWisher', 'tsop14', 'NataliaLKB', 250 | 'izaakrogan', 'lynnaloo', 'nvcexploder', 'cwaring', 'missinglink', 'alanshaw', 'olizilla', 251 | 'tancredi', 'Ericat', 'pgte' 'hyprstack', 'iteles' ], 252 | url: 'https://github.com/nelsonic/following?page=3', 253 | next_page: 'https://github.com/nelsonic/following?page=4' 254 | } 255 | ``` 256 | 257 | ### Starred Repositories 258 | 259 | The list of projects a person has *starred* a fascinating source of insight. 
260 | url format: https://github.com/stars/{username} 261 | e.g: [/stars/iteles](https://github.com/stars/iteles) 262 | 263 | ```js 264 | var gs = require('github-scraper'); // require the module 265 | var url = 'stars/iteles'; // starred repos for this user 266 | gs(url, function(err, data) { 267 | console.log(data); // or what ever you want to do with the data 268 | }) 269 | ``` 270 | 271 | Sample output: 272 | 273 | ```js 274 | { 275 | entries: 276 | [ '/dwyl/repo-badges', '/nelsonic/learn-testling', '/joshpitzalis/testing', '/gmarena/gmarena.github.io', 277 | '/dwyl/alc', '/nikhilaravi/fac5-frontend', '/foundersandcoders/dossier', '/nelsonic/health', '/dwyl/alvo', 278 | '/marmelab/gremlins.js', '/docdis/learn-saucelabs', '/rogerdudler/git-guide', '/tableflip/guvnor', 279 | '/dwyl/learn-redis', '/foundersandcoders/playbook', '/MIJOTHY/FOR_FLUX_SAKE', '/NataliaLKB/learn-git-basics', 280 | '/nelsonic/liso', '/dwyl/learn-json-web-tokens', '/dwyl/hapi-auth-jwt2', '/dwyl/start-here', 281 | '/arvida/emoji-cheat-sheet.com', '/dwyl/time', '/docdis/learn-react', '/dwyl/esta', '/alanshaw/meteor-foam', 282 | '/alanshaw/stylist', '/meteor-velocity/velocity', '/0nn0/terminal-mac-cheatsheet', 283 | '/bowlingjs/bowlingjs.github.io' ], 284 | url: 'https://github.com/stars/iteles?direction=desc&page=2&sort=created', 285 | next_page: 'https://github.com/stars/iteles?direction=desc&page=3&sort=created' 286 | } 287 | ``` 288 | 289 | ### Repositories 290 | 291 | The second tab on the personal profile page is "Repositories" 292 | this is a **list** of the ***personal projects*** the person is working on, e.g: https://github.com/iteles?tab=repositories 293 | 294 | github-ines-list-of-repositories 295 | 296 | We crawl this page and return an array containing the repo properties: 297 | 298 | ```js 299 | var url = 'iteles?tab=repositories'; 300 | gs(url, function(err, data) { 301 | console.log(data); // or what ever you want to do with the data 302 | }) 303 | ``` 304 | 305 | sample output: 306 | 307 | ```js 308 | { 309 | entries: [ 310 | { url: '/iteles/learn-ab-and-multivariate-testing', 311 | name: 'learn-ab-and-multivariate-testing', 312 | lang: '', 313 | desc: 'Tutorial on A/B and multivariate testing', 314 | info: '', 315 | stars: '4', 316 | forks: '0', 317 | updated: '2015-07-08T08:36:37Z' }, 318 | { url: '/iteles/learn-tdd', 319 | name: 'learn-tdd', 320 | lang: 'JavaScript', 321 | desc: 'A brief introduction to Test Driven Development (TDD) in JavaScript', 322 | info: 'forked from dwyl/learn-tdd', 323 | stars: '0', 324 | forks: '4', 325 | updated: '2015-06-29T17:24:56Z' }, 326 | { url: '/iteles/practical-full-stack-testing', 327 | name: 'practical-full-stack-testing', 328 | lang: 'HTML', 329 | desc: 'A fork of @nelsonic\'s repo to allow for PRs', 330 | info: 'forked from nelsonic/practical-js-tdd', 331 | stars: '0', 332 | forks: '36', 333 | updated: '2015-06-06T14:40:43Z' }, 334 | { url: '/iteles/styling-for-accessibility', 335 | name: 'styling-for-accessibility', 336 | lang: '', 337 | desc: 'A collection of \'do\'s and \'don\'t\'s of CSS to ensure accessibility', 338 | info: '', 339 | stars: '0', 340 | forks: '0', 341 | updated: '2015-05-26T11:06:28Z' }, 342 | { url: '/iteles/Ultimate-guide-to-successful-meetups', 343 | name: 'Ultimate-guide-to-successful-meetups', 344 | lang: '', 345 | desc: 'The ultimate guide to organizing successful meetups', 346 | info: '', 347 | stars: '3', 348 | forks: '0', 349 | updated: '2015-05-19T09:40:39Z' }, 350 | { url: '/iteles/Javascript-the-Good-Parts-notes', 351 | name: 
'Javascript-the-Good-Parts-notes', 352 | lang: '', 353 | desc: 'Notes on the seminal "Javascript the Good Parts: byDouglas Crockford', 354 | info: '', 355 | stars: '41', 356 | forks: '12', 357 | updated: '2015-05-17T16:39:35Z' } 358 | ], 359 | url: 'https://github.com/iteles?tab=repositories' } 360 | ``` 361 | 362 | 363 | ### Activity feed 364 | 365 | Every person on GitHub has an RSS feed for their recent activity; 366 | this is the 3rd and final tab of the person's profile page. 367 | 368 | it can be viewed online by visiting: 369 | ```sh 370 | https://github.com/{username}?tab=activity 371 | ``` 372 | e.g: [/iteles?tab=activity](https://github.com/iteles?tab=activity) 373 | 374 | 375 | #### Parsing the Feed 376 | 377 | The activity feed is published as an [**.atom**](https://en.wikipedia.org/wiki/RSS) 378 | xml string which contains a list of entries. 379 | 380 | We use [**xml2js**](https://www.npmjs.com/package/xml2js) 381 | (which in turn uses the [**sax**](https://www.npmjs.com/package/sax) xml parser) to parse the xml stream. This results in a object similar to the following example: 382 | 383 | ```js 384 | { '$': 385 | { xmlns: 'http://www.w3.org/2005/Atom', 386 | 'xmlns:media': 'http://search.yahoo.com/mrss/', 387 | 'xml:lang': 'en-US' }, 388 | id: [ 'tag:github.com,2008:/iteles' ], 389 | link: [ { '$': [Object] }, { '$': [Object] } ], 390 | title: [ 'iteles’s Activity' ], 391 | updated: [ '2015-07-22T23:31:25Z' ], 392 | entry: 393 | [ { id: [Object], 394 | published: [Object], 395 | updated: [Object], 396 | link: [Object], 397 | title: [Object], 398 | author: [Object], 399 | 'media:thumbnail': [Object], 400 | content: [Object] }, 401 | { id: [Object], 402 | published: [Object], 403 | updated: [Object], 404 | link: [Object], 405 | title: [Object], 406 | author: [Object], 407 | 'media:thumbnail': [Object], 408 | content: [Object] } 409 | ] 410 | } 411 | ``` 412 | Each call to the atom feed returns the latest 30 enties. 413 | We're showing 2 here for illustration (_so you get the idea..._) 414 | 415 | From this we _extract_ only the relevant info: 416 | 417 | ```sh 418 | '2015-07-22T12:33:14Z alanshaw pushed to master at alanshaw/david-www', 419 | '2015-07-22T12:33:14Z alanshaw created tag v9.4.3 at alanshaw/david-www', 420 | '2015-07-22T09:23:28Z alanshaw closed issue tableflip/i18n-browserify#6', 421 | '2015-07-21T17:08:19Z alanshaw commented on issue alanshaw/david#71', 422 | '2015-07-21T08:24:13Z alanshaw pushed to master at tableflip/score-board', 423 | '2015-07-20T17:49:59Z alanshaw deleted branch refactor-corp-events at tableflip/sow-api-client', 424 | '2015-07-20T17:49:58Z alanshaw pushed to master at tableflip/sow-api-client', 425 | '2015-07-20T17:49:58Z alanshaw merged pull request tableflip/sow-api-client#2', 426 | '2015-07-20T17:49:54Z alanshaw opened pull request tableflip/sow-api-client#2', 427 | '2015-07-18T07:30:36Z alanshaw closed issue alanshaw/md-tokenizer#1', 428 | '2015-07-18T07:30:36Z alanshaw commented on issue alanshaw/md-tokenizer#1', 429 | ``` 430 | Instead of _wasting_ (_what will be **Giga**_) ***Bytes*** of space with key:value pairs by storing the entries as JSON, we are storing the activity feed entries as strings in an array. 
431 | Each item in the array can be broken down into: 432 | ```sh 433 | {date-time} {username} {action} {link} 434 | ``` 435 | 436 | As we can see from this there are several event types: 437 | 438 | + **pushed to master** at 439 | + **created tag** v9.4.3 at 440 | + **opened issue** 441 | + **commented on issue** 442 | + **closed issue** 443 | + **deleted branch** 444 | + **opened pull request** 445 | + **merged pull request** 446 | + **starred** username/repo-name 447 | 448 | For now we are *not* going to parse the event types, we are simply going to store them in our list for later analysis. 449 | 450 | We have a good pointer when its time to start interpreting the data: 451 | https://developer.github.com/v3/activity/events/types/ 452 | 453 | One thing worth noting is that RSS feed is ***Not Real-Time*** ... 454 | sadly, it only gets updated periodically so we cannot rely on it to 455 | have the *latest* info. 456 | 457 | 458 | ### Organization 459 | 460 | Organization pages have the following url pattern: `https://github.com/{orgname}` 461 | example: [https://github.com/**dwyl**](https://github.com/dwyl) 462 | 463 | ```js 464 | var url = 'dwyl'; 465 | gs(url, function(err, data) { 466 | console.log(data); // or do something way more interesting with the data! 467 | }); 468 | ``` 469 | 470 | sample data (`entries` _truncated for brevity_): 471 | ```js 472 | { 473 | entries: 474 | [ { name: 'hapi-auth-jwt2', 475 | desc: 'Secure Hapi.js authentication plugin using JSON Web Tokens (JWT)', 476 | updated: '2015-08-04T19:30:50Z', 477 | lang: 'JavaScript', 478 | stars: '59', 479 | forks: '11' }, 480 | { name: 'start-here', 481 | desc: 'A Quick-start Guide for People who want to DWYL', 482 | updated: '2015-08-03T11:04:14Z', 483 | lang: 'HTML', 484 | stars: '14', 485 | forks: '9' }, 486 | { name: 'summer-2015', 487 | desc: 'Probably the best Summer Sun, Fun & Coding Experience in the World!', 488 | updated: '2015-07-31T11:02:29Z', 489 | lang: 'CSS', 490 | stars: '16', 491 | forks: '1' }, 492 | ], 493 | website: 'http://dwyl.io', 494 | url: 'https://github.com/dwyl', 495 | name: 'dwyl - do what you love', 496 | desc: 'Start here: https://github.com/dwyl/start-here', 497 | location: 'Your Pocket', 498 | email: 'github@dwyl.io', 499 | pcount: 24, 500 | avatar: 'https://avatars3.githubusercontent.com/u/11708465?v=3&s=200', 501 | next_page: '/dwyl?page=2' 502 | } 503 | ``` 504 | Note #1: *sadly*, this has the ***identical*** url format to *Profile* 505 | this gets handled by the `switcher` which infers what is an org vs. profile page 506 | by checking for an known element on the page. 507 | 508 | Note #2: when an organization has *multiple pages* of repositories you will see a `next_page` 509 | key/value in the `data` e.g: [/dwyl?**page=2**](/dwyl?page=2) (for the second page of repos) 510 | 511 | 512 | ### Repository Stats 513 | 514 | This is where things start getting interesting ... 515 | 516 | ![github-repo-page](https://cloud.githubusercontent.com/assets/194400/8930109/d8a76ab8-3522-11e5-8e07-95596a889fde.png) 517 | 518 | example: https://github.com/nelsonic/adoro 519 | 520 | ```js 521 | var url = 'nelsonic/adoro'; 522 | gs(url, function(err, data) { 523 | console.log(data); // or do something way more interesting with the data! 524 | }); 525 | ``` 526 | 527 | sample data: 528 | 529 | ```js 530 | { 531 | url: 'https://github.com/nelsonic/adoro', 532 | desc: 'The little publishing tool you\'ll love using. 
[work-in-progress]', 533 | website: 'http://www.dwyl.io/', 534 | watchers: 3, 535 | stars: 8, 536 | forks: 1, 537 | commits: 12, 538 | branches: 1, 539 | releases: 1, 540 | langs: [ 'JavaScript 90.7%', 'CSS 9.3%' ] 541 | } 542 | ``` 543 | 544 | > Annoyingly the number of issues and pull requests, contributors and issues 545 | are only rendered *after* the page has loaded (via XHR) so we do not get 546 | these three stats on page load. 547 | 548 | 549 | ### 7. Issues 550 | 551 | Clicking on the issues icon/link in any repository takes us to the list of all the issues. 552 | 553 | Visiting a project with more than a page worth of issues has pagination at the bottom of the page: 554 | 555 | ![tudo-issues-list-showing-pagination](https://cloud.githubusercontent.com/assets/194400/8942419/27b9446a-356d-11e5-84f9-5de2eaae506b.png) 556 | 557 | Which has a link to: https://github.com/dwyl/tudo/issues?page=2&q=is%3Aissue+is%3Aopen 558 | 559 | ![tudo-issues-second-page](https://cloud.githubusercontent.com/assets/194400/8942423/33bf0a2e-356d-11e5-82b8-1bd142fb2302.png) 560 | 561 | List of issues for a repository: 562 | 563 | ```js 564 | var gs = require('github-scraper'); 565 | var url = '/dwyl/tudo/issues'; 566 | gs(url, function (err, data) { 567 | console.log(data); // use the data how ever you like 568 | }); 569 | ``` 570 | 571 | sample output: 572 | 573 | ```sh 574 | { entries: 575 | [ 576 | { 577 | url: '/dwyl/tudo/issues/46', 578 | title: 'discuss components', 579 | created: '2015-07-21T15:34:22Z', 580 | author: 'benjaminlees', 581 | comments: 3, 582 | assignee: 'izaakrogan', 583 | milestone: 'I don\'t know what I\'m doing', 584 | labels: [ 'enhancement', 'help wanted', 'question' ] 585 | }, 586 | { 587 | url: '/dwyl/tudo/issues/45', 588 | title: 'Create riot components from HTML structure files', 589 | created: '2015-07-21T15:24:58Z', 590 | author: 'msmichellegar', 591 | comments: 2, 592 | assignee: 'msmichellegar', 593 | labels: [ 'question' ] 594 | } 595 | ], // truncated for brevity 596 | open: 30, 597 | closed: 20, 598 | next: '/dwyl/tudo/issues?page=2&q=is%3Aissue+is%3Aopen', 599 | url: '/dwyl/tudo/issues' 600 | } 601 | ``` 602 | 603 | Each issue in the list would create a entry in the crawler (worker) queue: 604 | 605 | ```sh 606 | 2015-07-22T12:33:14Z issue /dwyl/tudo/issues/77 607 | ``` 608 | 609 | > Should we include the "all issues by this author" link? 610 | + **created_by** https://github.com/dwyl/tudo/issues/created_by/iteles 611 | + **assignee** (assigned to): https://github.com/dwyl/tudo/issues?q=assignee%3Aiteles+is%3Aopen 612 | 613 | 614 | ### Issue (_individual_) 615 | 616 | The result of scraping https://github.com/dwyl/tudo/issues/51 617 | 618 | ```js 619 | var gs = require('github-scraper'); 620 | var url = '/dwyl/tudo/issues/51'; 621 | gs(url, function (err, data) { 622 | console.log(data); // use the data how ever you like 623 | }); 624 | ``` 625 | 626 | sample output: 627 | 628 | ```js 629 | { entries: 630 | [ { id: 'issue-96442793', 631 | author: 'nelsonic', 632 | created: '2015-07-22T00:00:45Z', 633 | body: 'instead of waiting for people to perform the steps to authorise Tudo (to access their GitHub orgs/issues we could request their GitHub username on the login page and initiate the retrieval of their issues while they are authenticating... That way, by the time they get back to Tudo their issues dashboard is already pre-rendered and loaded! This is a wow-factor people won\'t be expecting and thus our app immediately delivers on our first promise!\n\nThoughts?' 
}, 634 | { id: 'issuecomment-123807796', 635 | author: 'iteles', 636 | created: '2015-07-22T17:54:12Z', 637 | body: 'I\'d love to test this out, this will be an amazing selling point if we can get the performance to work like we expect!' }, 638 | { id: 'issuecomment-124048121', 639 | author: 'nelsonic', 640 | created: '2015-07-23T10:20:15Z', 641 | body: '@iteles have you watched the Foundation Episode featuring Kevin Systrom (instagram) ?\n\n\nhttps://www.youtube.com/watch?v=nld8B9l1aRE\n\n\nWhat were the USPs that contributed to instagram\'s success (considering how many photo-related-apps were in the app store at the time) ?\n\ncc: @besarthoxhaj' }, 642 | { id: 'issuecomment-124075792', 643 | author: 'besarthoxhaj', 644 | created: '2015-07-23T11:59:31Z', 645 | body: '@nelsonic love the idea! Let\'s do it!' } ], 646 | labels: [ 'enhancement', 'help wanted', 'question' ], 647 | participants: [ 'nelsonic', 'iteles', 'besarthoxhaj' ], 648 | url: '/dwyl/tudo/issues/51', 649 | title: 'Pre-fetch people\'s issues while they are authenticating with GitHub', 650 | state: 'Open', 651 | author: 'nelsonic', 652 | created: '2015-07-22T00:00:45Z', 653 | milestone: 'Minimal Usable Product', 654 | assignee: 'besarthoxhaj' } 655 | ``` 656 | 657 | By contrast using the GitHub API to fetch this issue 658 | see: https://developer.github.com/v3/issues/#get-a-single-issue 659 | 660 | format: 661 | ```sh 662 | /repos/:owner/:repo/issues/:number 663 | ``` 664 | 665 | ```sh 666 | curl https://api.github.com/repos/dwyl/tudo/issues/51 667 | ``` 668 | 669 | ### Milestones 670 | 671 | Milestones are used to group issues into logical units. 672 | 673 | ![dwyl-tudo-milestones](https://cloud.githubusercontent.com/assets/194400/9010055/b3e4da72-379c-11e5-8fd3-680bf928a389.png) 674 | 675 | ```js 676 | var gs = require('github-scraper'); 677 | var url = '/dwyl/tudo/milestones'; 678 | gs(url, function (err, data) { 679 | console.log(data); // use the data how ever you like 680 | }); 681 | ``` 682 | 683 | Sample output: 684 | 685 | ```js 686 | { entries: 687 | [ { name: 'Test Milestone - Please Don\'t Close!', 688 | due: 'Past due by 16 days', 689 | updated: 'Last updated 5 days ago', 690 | desc: 'This Milestone in used in our e2e tests to check for an over-due milestone, so please don\'t close it!', 691 | progress: '0%', 692 | open: 1, 693 | closed: 0 }, 694 | { name: 'Minimal Usable Product', 695 | due: 'Due by July 5, 2016', 696 | updated: 'Last updated 2 days ago', 697 | desc: 'What is the absolute minimum we can do to deliver value to people using the app?\n(and thus make them want to come back and use it!)', 698 | progress: '0%', 699 | open: 5, 700 | closed: 0 } ], 701 | url: 'https://github.com/dwyl/tudo/milestones', 702 | open: 2, 703 | closed: 1 } 704 | ``` 705 | 706 | ### Labels (for a repository) 707 | 708 | All repositories have a set of standard labels (built-in to GitHub) 709 | e.g: https://github.com/dwyl/tudo/labels is (_currently_) only using the "*standard*" labels. 710 | 711 | github-dwyl-tudo-labels-list 712 | 713 | Whereas the RethinkDB (which uses GitHub for all their project tracking) uses _several **custom labels**_: 714 | https://github.com/rethinkdb/rethinkdb/labels 715 | 716 | github-rethinkdb-issues-list 717 | 718 | We need to crawl these for each repo. 
719 | 720 | ```js 721 | var gs = require('github-scraper'); 722 | var url = '/dwyl/time/labels'; 723 | gs(url, function (err, data) { 724 | console.log(data); // use the data how ever you like 725 | }); 726 | ``` 727 | 728 | Here's the extraction of the standard labels: 729 | ```js 730 | [ 731 | { name: 'bug', 732 | style: 'background-color: #fc2929; color: #fff;', 733 | link: '/dwyl/tudo/labels/bug', 734 | count: 3 }, 735 | { name: 'duplicate', 736 | style: 'background-color: #cccccc; color: #333333;', 737 | link: '/dwyl/tudo/labels/duplicate', 738 | count: 0 }, 739 | { name: 'enhancement', 740 | style: 'background-color: #84b6eb; color: #1c2733;', 741 | link: '/dwyl/tudo/labels/enhancement', 742 | count: 11 }, 743 | { name: 'help wanted', 744 | style: 'background-color: #159818; color: #fff;', 745 | link: '/dwyl/tudo/labels/help%20wanted', 746 | count: 21 }, 747 | { name: 'invalid', 748 | style: 'background-color: #e6e6e6; color: #333333;', 749 | link: '/dwyl/tudo/labels/invalid', 750 | count: 1 }, 751 | { name: 'question', 752 | style: 'background-color: #cc317c; color: #fff;', 753 | link: '/dwyl/tudo/labels/question', 754 | count: 10 } 755 | ] 756 | ``` 757 | 758 | or a repo that has ***custom labels***: 759 | 760 | ```js 761 | { entries: 762 | [ { name: '[alpha]', 763 | style: 'background-color: #79CDCD; color: #1e3333;', 764 | link: '/dwyl/time/labels/%5Balpha%5D', 765 | count: 2 }, 766 | { name: 'API', 767 | style: 'background-color: #006b75; color: #fff;', 768 | link: '/dwyl/time/labels/API', 769 | count: 11 }, 770 | { name: 'bug', 771 | style: 'background-color: #fc2929; color: #fff;', 772 | link: '/dwyl/time/labels/bug', 773 | count: 5 }, 774 | { name: 'chore', 775 | style: 'background-color: #e11d21; color: #fff;', 776 | link: '/dwyl/time/labels/chore', 777 | count: 9 }, 778 | { name: 'discuss', 779 | style: 'background-color: #bfe5bf; color: #2a332a;', 780 | link: '/dwyl/time/labels/discuss', 781 | count: 43 }, 782 | { name: 'Documentation', 783 | style: 'background-color: #eb6420; color: #fff;', 784 | link: '/dwyl/time/labels/Documentation', 785 | count: 2 }, 786 | { name: 'duplicate', 787 | style: 'background-color: #cccccc; color: #333333;', 788 | link: '/dwyl/time/labels/duplicate', 789 | count: 0 }, 790 | { name: 'enhancement', 791 | style: 'background-color: #84b6eb; color: #1c2733;', 792 | link: '/dwyl/time/labels/enhancement', 793 | count: 27 }, 794 | { name: 'external dependency', 795 | style: 'background-color: #D1EEEE; color: #2c3333;', 796 | link: '/dwyl/time/labels/external%20dependency', 797 | count: 1 }, 798 | { name: 'FrontEnd', 799 | style: 'background-color: #f7c6c7; color: #332829;', 800 | link: '/dwyl/time/labels/FrontEnd', 801 | count: 26 }, 802 | { name: 'help wanted', 803 | style: 'background-color: #009800; color: #fff;', 804 | link: '/dwyl/time/labels/help%20wanted', 805 | count: 42 }, 806 | { name: 'invalid', 807 | style: 'background-color: #e6e6e6; color: #333333;', 808 | link: '/dwyl/time/labels/invalid', 809 | count: 0 }, 810 | { name: 'investigate', 811 | style: 'background-color: #fbca04; color: #332900;', 812 | link: '/dwyl/time/labels/investigate', 813 | count: 18 }, 814 | { name: 'MVP', 815 | style: 'background-color: #207de5; color: #fff;', 816 | link: '/dwyl/time/labels/MVP', 817 | count: 27 }, 818 | { name: 'NiceToHave', 819 | style: 'background-color: #fbca04; color: #332900;', 820 | link: '/dwyl/time/labels/NiceToHave', 821 | count: 7 }, 822 | { name: 'Post MVP', 823 | style: 'background-color: #fef2c0; color: #333026;', 824 | link: 
'/dwyl/time/labels/Post%20MVP', 825 | count: 24 }, 826 | { name: 'question', 827 | style: 'background-color: #cc317c; color: #fff;', 828 | link: '/dwyl/time/labels/question', 829 | count: 25 }, 830 | { name: 'UI', 831 | style: 'background-color: #bfdadc; color: #2c3233;', 832 | link: '/dwyl/time/labels/UI', 833 | count: 13 } ], 834 | url: 'https://github.com/dwyl/time/labels' } 835 | ``` 836 | 837 | ### Issues > *Search* (*Bonus Feature*) 838 | 839 | A ***much*** more *effective* way of collating all the issues relevant to a person is to search for them! 840 | 841 | example: 842 | https://github.com/search?type=Issues&q=author%3Aiteles&state=open&o=desc&s=created 843 | 844 | ```js 845 | { 846 | entries: 847 | [ 848 | { title: 'Remove flexbox from CSS', 849 | url: '/dwyl/dwyl.github.io/issues/29', 850 | desc: 'To ensure the site works across all devices, particularly Kindle/e-readers.', 851 | author: 'iteles', 852 | created: '2015-07-25T22:57:20Z', 853 | comments: 2 }, 854 | { title: 'CSS | Add indentation back into main.css (disappeared from master)', 855 | url: '/dwyl/tudo/issues/77', 856 | desc: 'All indentation has been removed from main.css in the latest commit. \n\nThis needs to be put back in as originally written by @msmichellegar and @iteles.', 857 | author: 'iteles', 858 | created: '2015-07-25T16:27:59Z' }, 859 | { title: 'CSS | Investigate styling of issue label colours', 860 | url: '/dwyl/tudo/issues/72', 861 | desc: 'Labels can be given any colour so there is no predictable set that we can code into the CSS file.\n\nWe need to investigate what the best way to ensure we can provide the right colour of background to the ...', 862 | author: 'iteles', 863 | created: '2015-07-23T17:49:02Z', 864 | comments: 4 } 865 | ], 866 | next: '/search?o=desc&p=2&q=author%3Aiteles&s=created&state=open&type=Issues' 867 | } 868 | ``` 869 | 870 | 871 | #### Owner 872 | 873 | For the issues created across all their *personal* repositories 874 | use a search query of the form: 875 | ```sh 876 | https://github.com/search?q=user%3A{username|org} 877 | &state={state} 878 | &type=Issues&s={relevance} 879 | &o={order} 880 | ``` 881 | e.g: 882 | https://github.com/search?q=user%3Aiteles&state=open&type=Issues&s=updated&o=asc 883 | 884 | #### Author (_created by_) 885 | 886 | Or to find ***all*** the issues where the person is the ***author*** 887 | use a query of the following format: 888 | 889 | ```sh 890 | https://github.com/search?q=author%3A{username|org} 891 | &state={state} 892 | &type=Issues&s={relevance} 893 | &o={order} 894 | ``` 895 | 896 | #### Assignee (_issues assigned to this person_) 897 | 898 | Or to find ***all*** the issues *assigned* to the person use a query of the following format: 899 | 900 | ```sh 901 | https://github.com/search?q=assignee%3A{username|org} 902 | &state={state} 903 | &type=Issues&s={relevance} 904 | &o={order} 905 | &s={filter} 906 | ``` 907 | 908 | #### Mentions 909 | 910 | We can use a ***mentions*** (search) query to discover all the 911 | issues where a given person (_username_) was mentioned: 912 | 913 | ```sh 914 | https://github.com/search?q=mentions%3A{username}&type=Issues&state={state} 915 | ``` 916 | 917 | e.g: https://github.com/search?q=mentions%3Aiteles&type=Issues&state=open 918 | 919 | This _could_ be more than the issues in the person's (_own_) repos *or* the repos the person has access to (_via org_). 
e.g: 920 | if [_Sally_](http://www.imdb.com/title/tt1483013/quotes?item=qt1905812) 921 | asks a clarifying question on a project she has not yet contributed to, 922 | the issue will not appear when we crawl the repos on her profile or orgs she has access to ... 923 | 924 | #### Issues Filters 925 | 926 | There are *many* filters we can use to find issues, here are a few: 927 | 928 | + **created**: https://github.com/search?q=author%3Aiteles&s=created&type=Issues&o=desc&state=open 929 | + **updated**: https://github.com/search?q=author%3Aiteles&s=updated&type=Issues&o=desc&state=open 930 | + **date range**: https://github.com/dwyl/time/issues?q=is%3Aissue+is%3Aopen+updated%3A%3C2015-06-28 931 | 932 | ##### Further Reading on Searching+Filters 933 | 934 | For *way* more details on searching & filters see: 935 | 936 | + https://help.github.com/articles/searching-issues/ 937 | + https://help.github.com/articles/searching-github/#types-of-searches 938 | + https://help.github.com/articles/search-syntax/ 939 | 940 | 941 | 942 | 943 | ## Want More Examples? 944 | 945 | If you want ***even more*** examples of the pages you can scrape, 946 | take a look at our end-to-end tests where we *test* all the scrapers! 947 | 948 |
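One pattern applies to _all_ of the paginated pages shown above (followers, following, starred repos, org repos): whenever the scraped `data` contains a `next_page` key you can recurse until it is absent. Here is a _minimal sketch_ of such a helper; `scrape_all_pages` is our own illustrative function, _not_ part of the `github-scraper` API:

```js
var gs = require('github-scraper');

// `scrape_all_pages` is a hypothetical helper (not part of github-scraper):
// it follows the `next_page` link recursively until the last page is reached.
function scrape_all_pages (url, entries, callback) {
  gs(url, function (err, data) {
    if (err) { return callback(err); }
    entries = entries.concat(data.entries); // accumulate this page's entries
    if (data.next_page) { // more pages to fetch? recurse:
      return scrape_all_pages(data.next_page, entries, callback);
    }
    return callback(null, entries); // no more pages, return the full list
  });
}

scrape_all_pages('iteles/followers', [], function (err, followers) {
  if (err) { return console.log(err); }
  console.log(followers); // the complete list of followers
});
```

Bear in mind that each page is a separate HTTP request to GitHub, so be considerate: `examples/list-repos.js` throttles its crawling with a simple `setInterval` for exactly this reason.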
949 | 950 | ## Future Features / Roadmap? 951 | 952 | 953 | ### Crawl the List of Commits 954 | 955 | Would it be interesting to see/track: 956 | + **who** makes the most commits to the project 957 | + **when** (***what time*** of day/night) people do their work 958 | + **what** did the person contribute? (docs, code improvement, tests, typo, dependency update?) 959 | 960 | Show your interest in this feature: https://github.com/nelsonic/github-scraper/issues/17 961 | 962 |


963 | 964 | # Contributing? 965 | 966 | Contributions are _always_ welcome! 967 | We have a backlog of features (_many pages we want to parse_)
968 | please see: https://github.com/nelsonic/github-scraper/issues
969 | If anything interests you, please leave a comment on the issue. 970 | 971 | Your first step to _contributing_ to this project 972 | is to run it on your **`localhost`**. 973 | 974 | ### 1. Clone the Repository 975 | 976 | In your terminal, clone the repository from GitHub: 977 | 978 | ```sh 979 | git clone https://github.com/nelsonic/github-scraper.git && cd github-scraper 980 | ``` 981 | 982 | ### 2. Install the Dependencies 983 | 984 | Ensure you have Node.js installed, see https://nodejs.org
985 | Then run the following command to install the project dependencies: 986 | 987 | ```sh 988 | npm install 989 | ``` 990 | 991 | You should see output in your terminal similar to the following: 992 | 993 | ``` 994 | added 162 packages from 177 contributors and audited 265 packages in 4.121s 995 | ``` 996 | 997 | That tells you that the dependencies were successfully installed. 998 | 999 | 1000 | ### 3. Run the Tests 1001 | 1002 | In your terminal, execute the following command: 1003 | 1004 | ```sh 1005 | npm test 1006 | ``` 1007 | 1008 | 1009 | You should see output similar to the following: 1010 | 1011 | ``` 1012 | > github-scraper@6.7.1 test /Users/n/code/github-scraper 1013 | > istanbul cover ./node_modules/tape/bin/tape ./test/*.js | node_modules/tap-spec/bin/cmd.js 1014 | 1015 | 1016 | read list of followers for @jupiter (single page of followers) 1017 | 1018 | - - - GitHub Scraper >> /jupiter/followers >> followers - - - 1019 | ✔ jupiter/followers data.type: followers 1020 | ✔ @jupiter/followers has 34 followers 1021 | ✔ Nelson in jupiter/followers 1022 | ✔ @jupiter/followers only has 1 page of followers 1023 | 1024 | read list of followers for @iteles (multi-page) 1025 | 1026 | - - - GitHub Scraper >> /iteles/followers >> followers - - - 1027 | ✔ "followers": 51 on page 1 1028 | ✔ iteles/followers multi-page followers 1029 | 1030 | 1031 | ... etc ... 1032 | 1033 | ============================================================================= 1034 | Writing coverage object [/Users/n/code/github-scraper/coverage/coverage.json] 1035 | Writing coverage reports at [/Users/n/code/github-scraper/coverage] 1036 | ============================================================================= 1037 | =============================== Coverage summary =============================== 1038 | Statements : 100% ( 192/192 ) 1039 | Branches : 100% ( 63/63 ) 1040 | Functions : 100% ( 22/22 ) 1041 | Lines : 100% ( 192/192 ) 1042 | ================================================================================ 1043 | 1044 | 1045 | total: 102 1046 | passing: 102 1047 | duration: 31.6s 1048 | ``` 1049 | 1050 | The tests take around 30 seconds to run on _my_ `localhost`, 1051 | but your test execution time will vary depending on your location 1052 | (_the further you are from GitHub's servers, the slower the tests will run..._). 1053 | 1054 | Don't panic if you see some red in your terminal while the tests are running. 1055 | We have to simulate failure `404` and `403` errors 1056 | to ensure that we can handle them. 1057 | Pages sometimes disappear, 1058 | e.g: a user leaves GitHub or deletes a project, 1059 | and our script needs to not "freak out" when that happens. 1060 | This is good practice in DOM parsing: the web changes a _lot_! 1061 | 1062 | When the tests _pass_ on your `localhost`, 1063 | you know everything is working as expected. 1064 | Time to move on to the fun bit! 1065 | 1066 | > **Note**: This project follows Test Driven Development (TDD) 1067 | because it's the only way we can maintain our sanity ... 1068 | If we didn't have tests it would be _chaos_ 1069 | and _everything_ would "break" all the time. 1070 | If you are contributing to the project, 1071 | please be aware that tests are required 1072 | and any Pull Requests without tests will not be considered 1073 | (_please don't take it personally, it's just a rule we have_).
1074 | 1075 | If you are new to TDD, please see: 1076 | [github.com/dwyl/**learn-tdd**](https://github.com/dwyl/learn-tdd) 1077 | 1078 | 1079 | 1080 | ### 4. Pick an Issue and Write Some Code! 1081 | 1082 | Once you have the project running on your `localhost`, 1083 | it's time to pick a page to parse! 1084 | 1085 | There are a bunch of features in the backlog, see: 1086 | https://github.com/nelsonic/github-scraper/issues 1087 | 1088 | Pick one that interests you 1089 | and write a comment on it 1090 | to _show_ your interest in contributing. 1091 | 1092 | 1093 | ### Continuous Integration? 1094 | 1095 | We use GitHub Actions for Continuous Integration (CI) 1096 | to ensure that our code works 1097 | and all tests _pass_ whenever a change is made to the code. 1098 | This is _essential_ in _any_ project and even more so in a DOM parsing one. 1099 | 1100 | If you are new to CI, please see: 1101 | [github.com/dwyl/**learn-travis**](https://github.com/dwyl/learn-travis) 1102 | 1103 | ### Pre-Commit Hook? 1104 | 1105 | When you attempt to commit code on your `localhost`, 1106 | the tests will run **`before`** your commit registers. 1107 | This is a precaution to ensure that the code we write is _always tested_. 1108 | There is no point writing code that is not being tested 1109 | as it will "break" almost immediately and be unmaintainable. 1110 | 1111 | Simply wait a few seconds for the tests to pass 1112 | and then push your work to GitHub. 1113 | 1114 | If you are new to pre-commit hooks, please see: 1115 | [github.com/dwyl/**learn-pre-commit**](https://github.com/dwyl/learn-pre-commit) 1116 | 1117 | 1118 |
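For reference, this is roughly what such a hook looks like when wired up with the [`pre-commit`](https://www.npmjs.com/package/pre-commit) module. This is a sketch of the general idea only; check this project's actual `package.json` for its exact setup:

```json
{
  "scripts": {
    "test": "istanbul cover ./node_modules/tape/bin/tape ./test/*.js | node_modules/tap-spec/bin/cmd.js"
  },
  "pre-commit": ["test"]
}
```

With this in place, `git commit` only succeeds if the `test` script exits cleanly.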


1119 | 1120 | ## tl;dr 1121 | 1122 | If you are the kind of person that likes to *understand* how something works, 1123 | this is *your* section. 1124 | 1125 | ### Inferring Which Scraper to use from the URL 1126 | 1127 | `lib/switcher.js` handles inference. 1128 | We wanted to use a `switch > case` construct but, ended up using `if/else` 1129 | because there are two types of checks we need to do so `if/else` seemed simpler. 1130 | 1131 | 1132 | ## Interesting Facts 1133 | 1134 | - GitHub has 10.3 Million users (_at last count_) 1135 | - yet the most followed person [Linus Torvalds](https://github.com/torvalds) 1136 | "_only_" has **28k followers** (_so its a **highly distributed network**_ ) 1137 | + https://www.githubarchive.org/ attempts to archive all of GitHub 1138 | + http://octoboard.com/ shows stats for the past 24h 1139 | 1140 | 1141 | ## Research 1142 | 1143 | > Must read up about http://en.wikipedia.org/wiki/Inverted_index 1144 | > so I understand how to use: https://www.npmjs.org/package/level-inverted-index 1145 | 1146 | - GitHub stats (node module): https://github.com/apiengine/ghstats 1147 | (no tests or recent work/activity, but interesting functionality) 1148 | 1149 | - Hard Drive reliability stats: 1150 | https://www.backblaze.com/blog/hard-drive-reliability-update-september-2014 1151 | (useful when selecting which drives to use in the storage array - 1152 | Clear Winner is Hitachi 3TB) 1153 | - RAID explained in layman's terms: 1154 | http://uk.pcmag.com/storage-devices-reviews/7917/feature/raid-levels-explained 1155 | - RAID Calculator: 1156 | https://www.synology.com/en-global/support/RAID_calculator 1157 | (if you don't already know how much space you get) 1158 | - SQLite limits: https://www.sqlite.org/limits.html 1159 | 1160 | ## Useful Links 1161 | 1162 | - Summary of ***Most Active*** GitHub users: http://git.io/top 1163 | - Intro to web-scraping with cheerio: 1164 | https://www.digitalocean.com/community/tutorials/how-to-use-node-js-request-and-cheerio-to-set-up-simple-web-scraping 1165 | - GitHub background info: http://en.wikipedia.org/wiki/GitHub 1166 | + GitHub Event Types: 1167 | https://developer.github.com/v3/activity/events/types/ 1168 | 1169 | ### GitHub Stats API 1170 | 1171 | - Github Stats API: https://developer.github.com/v3/repos/statistics/ 1172 | - GitHub Followers API: https://developer.github.com/v3/users/followers/ 1173 | 1174 | Example: 1175 | 1176 | ```sh 1177 | curl -v https://api.github.com/users/pgte/followers 1178 | ``` 1179 | 1180 | ```js 1181 | [ 1182 | { 1183 | "login": "methodmissing", 1184 | "id": 379, 1185 | "avatar_url": "https://avatars.githubusercontent.com/u/379?v=2", 1186 | "gravatar_id": "", 1187 | "url": "https://api.github.com/users/methodmissing", 1188 | "html_url": "https://github.com/methodmissing", 1189 | "followers_url": "https://api.github.com/users/methodmissing/followers", 1190 | "following_url": "https://api.github.com/users/methodmissing/following{/other_user}", 1191 | "gists_url": "https://api.github.com/users/methodmissing/gists{/gist_id}", 1192 | "starred_url": "https://api.github.com/users/methodmissing/starred{/owner}{/repo}", 1193 | "subscriptions_url": "https://api.github.com/users/methodmissing/subscriptions", 1194 | "organizations_url": "https://api.github.com/users/methodmissing/orgs", 1195 | "repos_url": "https://api.github.com/users/methodmissing/repos", 1196 | "events_url": "https://api.github.com/users/methodmissing/events{/privacy}", 1197 | "received_events_url": 
"https://api.github.com/users/methodmissing/received_events", 1198 | "type": "User", 1199 | "site_admin": false 1200 | }, 1201 | 1202 | etc...] 1203 | ``` 1204 | 1205 | #### Issues (with using the) GitHub API: 1206 | 1207 | - The API only returns 30 results per query. 1208 | - **X-RateLimit-Limit**: **60** (can only make 60 requests per hour) ... 1209 | 1440 queries per day (60 per hour x 24 hours) sounds like *ample* on the surface. 1210 | But, if we assume the average person has at least 2 pages worth of followers (30<) 1211 | it means on a single instance/server we can only track 720 people. 1212 | Not really enough to do any sort of trend analysis. :disappointed: 1213 | If we are tracking people with hundreds of followers (and *growing fast*) 1214 | e.g. 300< followers. the number of users we can track comes down to 1215 | 1440 / 10 = 140 people... 1216 | (10 requests to fetch complete list of followers) we burn through 1440 requests 1217 | pretty quickly. 1218 | - There's no guarantee which order the followers will be in 1219 | (e.g. most recent first?) 1220 | - **Results** are ***Cached*** so they are not-real time like they are in the 1221 | Web. (seems daft, but its true.) Ideally they would have a ***Streaming API*** 1222 | but sadly, [GitHub is built in Ruby-on-Rails](http://builtwith.com/github.com) 1223 | which is "**RESTful**" (***not real-time***). 1224 | 1225 | #### *But*... 1226 | 1227 | Once we know _who_ we *should* be following, we can use 1228 | 1229 | - https://developer.github.com/v3/users/followers/#follow-a-user 1230 | - https://developer.github.com/v3/users/followers/#check-if-one-user-follows-another 1231 | 1232 | e.g: 1233 | ```sh 1234 | curl -v https://api.github.com/users/pgte/following/visionmedia 1235 | ``` 1236 | 1237 |


1238 | 1239 | # FAQ? 1240 | 1241 | ## Is *Crawling* a Website *Legal*...? 1242 | 1243 | The fact that scraping or "crawling" is Google's Business Model suggests that scraping is at least "OK" ... 1244 | 1245 | Started typing this into google and saw: 1246 | is-it-illegal-to 1247 | 1248 | I read a few articles and was not able to locate a definitive answer ... 1249 | 1250 | + Legal Issues: https://en.wikipedia.org/wiki/Web_scraping#Legal_issues 1251 | + It depends: http://resources.distilnetworks.com/h/i/53822104-is-web-scraping-illegal-depends-on-what-the-meaning-of-the-word-is-is/181642 1252 | + Screen scraping: How to profit from your rival's data: 1253 | http://www.bbc.com/news/technology-23988890 1254 | + Web Scraping For Fun and Profit: https://blog.hartleybrody.com/web-scraping/ 1255 | -------------------------------------------------------------------------------- /config/repos.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | const SELECTORS={ 4 | COMMIT:".Box-header--blue strong", 5 | LANGUAGES:".BorderGrid--spacious .BorderGrid-row", 6 | FORKED_FROM:'a[data-hovercard-type="repository"]', 7 | FOLLOWERS:'.Layout-main .d-table', 8 | TOPIC_TAG:".topic-tag", 9 | PROFILE:'div[itemtype="http://schema.org/Person"]' 10 | } 11 | 12 | module.exports = SELECTORS; -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | dwyl 2 | *.json 3 | -------------------------------------------------------------------------------- /examples/data/___next_page.txt: -------------------------------------------------------------------------------- 1 | /dwyl?page=2 2 | https://github.com/dwyl/aws-lambda-deploy/stargazers?after=Y3Vyc29yOnYyOpO0MjAxNi0wOC0yOVQwNDo0ODozNloAzgP5isg%3D 3 | https://github.com/dwyl/learn-nightwatch/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0wOC0yMFQxNjoyOTowMFoAzgrqe7Q%3D 4 | https://github.com/dwyl/english-words/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0yOFQyMToyOTozOVoAzgymXrc%3D 5 | https://github.com/dwyl/learn-elm/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMS0yOFQxMDozMzozOVoAzgwqTrg%3D 6 | /dwyl/learn-to-send-email-via-google-script-html-no-server/watchers?page=2 7 | https://github.com/dwyl/phoenix-liveview-counter-tutorial/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0wNi0wN1QyMDo1Mjo1MFoAzgpVseM%3D 8 | https://github.com/dwyl/learn-aws-lambda/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMi0wOFQxMTo1MTo0OVoAzgvJ_MM%3D 9 | https://github.com/dwyl/aws-sdk-mock/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0wMlQwOTo1NTozMVoAzgxtXqE%3D 10 | /dwyl/learn-elm/watchers?page=2 11 | https://github.com/dwyl/hapi-auth-jwt2/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMS0yMFQxMjowNDo0NVoAzgulaBM%3D 12 | https://github.com/dwyl/phoenix-ecto-encryption-example/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMC0xNlQxMTo0MDoyNFoAzgtdNa8%3D 13 | https://github.com/dwyl/learn-to-send-email-via-google-script-html-no-server/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMy0xNVQwMDoxNzo0MVoAzgyIV5U%3D 14 | https://github.com/dwyl/learn-phoenix-framework/stargazers?after=Y3Vyc29yOnYyOpO0MjAxOS0xMS0yMlQwMzowMDo0MloAzgupDGk%3D 15 | https://github.com/dwyl/phoenix-chat-example/stargazers?after=Y3Vyc29yOnYyOpO0MjAyMC0wMS0wM1QwMzozMTowMloAzgv6dNQ%3D 16 | /dwyl/english-words/watchers?page=2 17 | /dwyl/learn-aws-lambda/watchers?page=2 18 | /dwyl/learn-nightwatch/watchers?page=2 19 | -------------------------------------------------------------------------------- 
/examples/get_profile.js: -------------------------------------------------------------------------------- 1 | const fs = require("fs") 2 | const gs = require("../lib/switcher"); 3 | const url = "andrew" // "iteles" // a random username 4 | gs(url, function(err, data) { 5 | 6 | fs.writeFileSync(__dirname + "/" + url + ".json", JSON.stringify(data, null, 2)) 7 | console.log(data); // or what ever you want to do with the data 8 | }) 9 | -------------------------------------------------------------------------------- /examples/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /examples/list-repos.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var gs = require('../lib'); 3 | var stars = require('./stars-recursive-scrape-save.js'); 4 | var NEXT_PAGE_LIST = stars.NEXT_PAGE_LIST; // to be re-factored! 5 | 6 | var org = 'dwyl/'; 7 | gs(org, process_org_page); 8 | 9 | function process_org_page(err, data) { 10 | if(data && data.entries) { 11 | data.entries.forEach(function (repo) { 12 | stars(org + repo.name); 13 | }) 14 | if(data.next_page) { 15 | // gs(data.next_page, process_org_page); 16 | stars.save_next_page(data.next_page); 17 | } 18 | } 19 | else { 20 | console.log(data); 21 | } 22 | } 23 | 24 | function crawl_next() { 25 | fs.readFile(NEXT_PAGE_LIST, 'utf8', function (err, data) { 26 | if (err) { 27 | console.log(err); 28 | } 29 | else { 30 | var url = data.split('\n')[0]; 31 | var linesExceptFirst = data.split('\n').slice(1).join('\n'); 32 | fs.writeFile(NEXT_PAGE_LIST, linesExceptFirst); 33 | } 34 | if(url.indexOf('/dwyl?') > -1) { // org page 35 | gs(url, process_org_page); 36 | } 37 | else { 38 | stars(url); 39 | } 40 | }); 41 | } 42 | 43 | var interval = setInterval(function(){ 44 | crawl_next(); 45 | }, 2000); 46 | -------------------------------------------------------------------------------- /examples/stars-recursive-scrape-save.js: -------------------------------------------------------------------------------- 1 | // list of people who have starred a dwyl repository 2 | var gs = require('../lib'); 3 | var path = require('path'); 4 | var fs = require('fs'); 5 | var mkdirp = require('mkdirp'); 6 | 7 | // constants 8 | var TIMESTAMP = Date.now(); 9 | var GURL = 'https://github.com/'; 10 | var BASE_DIR = path.resolve('./', 'data') + '/'; 11 | console.log('BASE_DIR:', BASE_DIR ); 12 | var NEXT_PAGE_LIST = BASE_DIR + '___next_page.txt'; 13 | fs.openSync(NEXT_PAGE_LIST, 'a') // "touch" file to ensure it exists 14 | 15 | function main(url) { 16 | var DATA_DIR = path.normalize(BASE_DIR + url); // repository 17 | mkdirp.sync(DATA_DIR); // ensure the dir exists 18 | 19 | var p = ['stargazers', 'watchers']; 20 | // console.log('url.indexOf(p[0]) === -1 ', url.indexOf(p[0])) 21 | if(url.indexOf(p[0]) === -1 && url.indexOf(p[1]) === -1 ) { // url is base repo 22 | console.log('>>> ' + url) 23 | p.forEach(function(page) { 24 | gs(url + '/' + page, process_results); // start crawling stargazers 25 | }) 26 | } 27 | else { 28 | gs(url, process_results); 29 | } 30 | } 31 | 32 | function process_results(err, data) { 33 | if (err) { return console.log(err); } 34 | write_lines(data); 35 | if(data.next_page) { 36 | // gs(data.next_page, process_results); 37 | return save_next_page(data.next_page); 38 | } 39 | } 40 | 41 | function save_next_page(url) { 42 | var lines = fs.readFileSync(NEXT_PAGE_LIST).toString().split('\n'); 43 | if(lines.indexOf(url) === -1) { // ensure no duplicates 44 | fs.writeFileSync(NEXT_PAGE_LIST, lines.join('\n') + url + '\n'); 45 | } 46 | } 47 | 48 | 49 | function parse_file(filename) { 50 | var data = fs.readFileSync(filename).toString(); 51 | return data.split('\n').map(function (row) { 52 | if(row.length > 1) { 53 | var json_str = row.split(',')[1] 54 | var json = JSON.parse(json_str); 55 | return json.username; 56 | } 57 | }); 58 | } 59 | 60 | // write lines to file 61 | function write_lines(data) { 62 | var filepath = path.normalize(BASE_DIR + 63 | 
data.url.replace(GURL, '').split('?')[0]) + '.csv'
64 | 
65 |   fs.openSync(filepath, 'a') // "touch" file to ensure it exists
66 |   var existing = parse_file(filepath);
67 | 
68 |   var rows = data.entries.map(function(entry) {
69 |     if(existing.indexOf(entry.username) === -1) {
70 |       console.log('entry', entry);
71 |       return TIMESTAMP + ',' + JSON.stringify(entry);
72 |     }
73 |   }).filter(function (n) { return n != undefined }); // remove blanks
74 | 
75 |   if (rows.length > 0) {
76 |     var str = rows.join('\n') + '\n'; // end file with new line
77 |     return fs.appendFile(filepath, str, function (err) {
78 |       console.log('wrote ' + rows.length + ' lines to: ' + filepath); // count the rows we actually wrote
79 |     });
80 |   } else {
81 |     console.log('no new faces')
82 |   }
83 | }
84 | 
85 | module.exports = main;
86 | module.exports.save_next_page = save_next_page;
87 | module.exports.NEXT_PAGE_LIST = NEXT_PAGE_LIST;
88 | 
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | require("env2")(".env");
 2 | const debug = require("./lambda/debug.js");
 3 | const gs = require('github-scraper');
 4 | 
 5 | exports.handler = function handler (event, context, callback) {
 6 |   console.log(event);
 7 |   console.log("Hi Friends!")
 8 |   debug(event);
 9 |   console.log('rawPath:', event.rawPath)
10 | 
11 |   const url = event.rawPath;
12 |   gs(url, function(err, data) {
13 |     console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
14 |     console.log(data);
15 |     if (err) { return callback(err); } // surface scraper errors (e.g: 404) to the Lambda runtime
16 |     return callback(null, data);
17 |   });
18 | }
--------------------------------------------------------------------------------
/lambda/debug.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | require('env2')('.env');
 3 | const save = require('./s3.js').save;
 4 | 
 5 | /**
 6 |  * `debug` is used to debug SNS notification events.
 7 |  * it only gets executed if the NODE_ENV is set to "test".
 8 |  * To save event data to S3 you will need to add AWS_S3_BUCKET to .env
 9 |  * see: github.com/dwyl/aws-ses-lambda/issues/12
10 |  * @param {Object} event - the object we want to store on S3
11 |  */
12 | module.exports = function debug (event) {
13 |   // console.log("process.env.NODE_ENV:", process.env.NODE_ENV);
14 |   if (process.env.NODE_ENV === "test") {
15 |     if(event.Records && !event.key) {
16 |       event.key = "sns";
17 |     }
18 |     save(event, function callback (error, data) {
19 |       console.log("DEBUG - - - error:", error, " - - - data:");
20 |       console.log(data);
21 |       console.log(" - - - - - - - - - - - - - - - - - - - - ");
22 |     });
23 |   }
24 | };
25 | 
--------------------------------------------------------------------------------
/lambda/http_request.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | 
 3 | require("env2")(".env"); // ensure JWT_SECRET environment variable is defined.
 4 | const http = require('https'); // ALWAYS use TLS over the internets!
 5 | const jwt = require('jsonwebtoken');
 6 | /**
 7 |  * simple_http_request is a bare-bones http request using node.js core http
 8 |  * see: https://nodejs.org/api/http.html#http_http_request_options_callback
 9 |  * @param {Object} json - the JSON data we want to send to the Phoenix App.
10 |  * @param {Function} callback - a standard callback with error & response args
11 |  *  response is a JSON Object unless there is an error.
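 *  @example
 *  // minimal usage sketch; assumes JWT_SECRET + EMAIL_APP_URL are set in .env
 *  // and the receiving app exposes POST /api/sns (see `options` below).
 *  // the payload here is hypothetical:
 *  //   simple_http_request({ key: "sns", message: "hello" }, function (status, body) {
 *  //     console.log(status, body); // status = HTTP statusCode, body = parsed JSON
 *  //   });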
12 |  */
13 | 
14 | module.exports = function simple_http_request (json, callback) {
15 |   const options = { // the json data is included in the token! 😮
16 |     headers: {
17 |       'Authorization': jwt.sign(json, process.env.JWT_SECRET),
18 |       'Accept': 'application/json'
19 |     },
20 |     hostname: process.env.EMAIL_APP_URL, // e.g: phemail.herokuapp.com
21 |     method: 'POST', // HTTP post sans body: stackoverflow.com/questions/4191593
22 |     port: '443',
23 |     path: '/api/sns' // the API endpoint that processes and stores SNS data
24 |   }
25 | 
26 |   http.request(options, function (res) {
27 |     let resStr = '';
28 |     res.setEncoding('utf8');
29 |     res.on('data', function (chunk) {
30 |       resStr += chunk;
31 |     }).on('end', function () {
32 |       return callback(res.statusCode, JSON.parse(resStr));
33 |     });
34 |   })
35 |   .on('error', (e) => { // without this handler a network failure would crash the lambda
36 |     console.error(`problem with request: ${e.message}`);
37 |   })
38 |   .end();
39 | };
--------------------------------------------------------------------------------
/lambda/s3.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | require('env2')('.env');
 3 | const AWS = require('aws-sdk');
 4 | AWS.config.region = 'eu-west-1';
 5 | var s3 = new AWS.S3({params: {Bucket: process.env.AWS_S3_BUCKET}});
 6 | 
 7 | /**
 8 |  * `save` saves a JSON object to S3.
 9 |  * if you need to specify the file name, use `json.key`
10 |  * @param {Object} json - the object we want to store on S3
11 |  * @param {Function} callback - called once the file has been uploaded
12 |  */
13 | module.exports.save = function save (json, callback) {
14 |   if (json) {
15 |     const filename = json.key || 'event'
16 |     const params = {
17 |       Key: filename + '.json',
18 |       Body: JSON.stringify(json),
19 |       ContentType: 'application/json',
20 |       ACL: 'public-read'
21 |     };
22 | 
23 |     s3.upload(params, function (err, data) {
24 |       if (callback && typeof callback === "function") {
25 |         return callback(err, data);
26 |       }
27 |       else {
28 |         return data;
29 |       }
30 |     });
31 | 
32 |   } else {
33 |     return callback('ERROR: please provide json data');
34 |   }
35 | }
36 | 
37 | /**
38 |  * `get` retrieves and parses a JSON file from S3
39 |  * this function is only used to test that the `save` method works.
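 *  @example
 *  // minimal usage sketch; assumes AWS credentials + AWS_S3_BUCKET are set in .env
 *  // and that save() previously stored an object under the (default) key "event":
 *  //   get('event.json', function (error, json) { console.log(error, json); });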
40 |  * @param {String} key - the filename of the object to get from S3
41 |  * @param {Function} callback - called with the parsed JSON once the file has been retrieved
42 |  */
43 | module.exports.get = function get (key, callback) {
44 |   s3.getObject({Key: key}, function (error, data) {
45 |     if (error) {
46 |       return callback(error);
47 |     }
48 |     else {
49 |       return callback(error, JSON.parse(data.Body.toString()));
50 |     }
51 |   });
52 | };
53 | 
--------------------------------------------------------------------------------
/lib/feed.js:
--------------------------------------------------------------------------------
 1 | // var wreck = require('wreck');
 2 | // var parse = require('xml2js').parseString;
 3 | //
 4 | // /**
 5 | //  * feed method parses a given GitHub user's activity feed
 6 | //  * (note: unlike the other scrapers, feed fetches its own XML, so there is no cheerio $ param)
 7 | //  * @param {string} url - a valid GitHub feed url format: {username}.atom
 8 | //  * @param {function} callback - the callback we should call after scraping
 9 | //  * a callback passed into this method should accept two parameters:
10 | //  * @param {object} error - an error object (set to null if no error occurred)
11 | //  * @param {object} data - activity feed entries (date/time + action) for the user
12 | //  */
13 | // module.exports = function feed (url, callback) {
14 | //   wreck.get(url, function (error, response, xml) {
15 | //     if (error) { // || response.output && response.output.statusCode !== 200) {
16 | //       return callback(404);
17 | //     }
18 | //     else {
19 | //       var data = {entries : [], url: url};
20 | //       parse(xml.toString(), function(err, JSON) {
21 | //         data.updated = JSON.feed.updated[0]; // when feed was last updated
22 | //         JSON.feed.entry.map(function(item) {
23 | //           // store only the date/time and action performed (space separated)
24 | //           data.entries.push(item.published[0] + ' ' + item.title[0]._);
25 | //         })
26 | //         return callback(error, data);
27 | //       });
28 | //     }
29 | //   });
30 | // }
31 | 
--------------------------------------------------------------------------------
/lib/followers.js:
--------------------------------------------------------------------------------
 1 | 
 2 | const selectors=require('../config/repos')
 3 | 
 4 | /**
 5 |  * followers method parses a given GitHub user's followers/following/stars list
 6 |  * @param {Object} $ - cheerio object with DOM of page to be scraped
 7 |  * @param {string} url - a valid GitHub username or url e.g: /{username}
 8 |  * @param {function} callback - the callback we should call after scraping
 9 |  * a callback passed into this method should accept two parameters:
10 |  * @param {object} error - an error object (set to null if no error occurred)
11 |  * @param {object} data - list of (Public) GitHub followers/following (for the user)
12 |  */
13 | module.exports = function followers ($, url, callback) {
14 |   console.log(url)
15 |   var data = { entries : [], url: url};
16 |   data.type = url.match(/tab=following/) ?
'following' : 'followers'; 17 | 18 | // console.log('selectors.FOLLOWERS', selectors.FOLLOWERS); 19 | $(`${selectors.FOLLOWERS}`).each(function(i, el){ 20 | 21 | data.entries.push({ 22 | avatar: $(this).find('img.avatar-user').first().attr("src"), 23 | fullname: $(this).find('.Link--primary').first().text(), 24 | username: $(this).find('.Link--secondary').first().text() 25 | }) 26 | }) 27 | 28 | data = require('./next_page')($, data); // don't worry require is cached ;-) 29 | callback(null, data) 30 | } 31 | -------------------------------------------------------------------------------- /lib/http_request.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var http = require('https'); // ALWAYS use TLS over the internets! 4 | var bgRedBlack = '\x1b[41m\x1b[30m'; 5 | var RESET = '\x1b[0m'; // see: https://stackoverflow.com/a/41407246/1148249 6 | /** 7 | * simple_http_request is a bare-bones http request using node.js core http 8 | * see: https://nodejs.org/api/http.html#http_http_request_options_callback 9 | * the NPM request module is 3.6 Megabytes and offers v. little benefit ... 10 | * This code achieves the same in less than 1kb. less code = faster response. 11 | * @param {Object} path - the path (on GitHub) we want to "view" 12 | * @param {Function} callback - a standard callback with error & response args 13 | * response is a JSON Object unless there is an error. 14 | */ 15 | 16 | module.exports = function simple_http_request (path, callback) { 17 | 18 | var options = { 19 | headers: { 20 | 'Accept': 'text/html', 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 22 | }, 23 | hostname: 'github.com', 24 | port: '443', 25 | path: path 26 | } 27 | 28 | http.request(options, function (res) { 29 | var resStr = ''; 30 | var response; 31 | // console.log(res.statusCode); 32 | if (res.statusCode !== 200) { 33 | console.log(bgRedBlack, ' GOT ', res.statusCode, ' for ', options, RESET); 34 | return callback(res.statusCode); 35 | } 36 | 37 | res.setEncoding('utf8'); 38 | res.on('data', function (chunk) { 39 | // console.log(chunk); 40 | resStr += chunk; 41 | }).on('end', function () { 42 | return callback(res.statusCode, resStr); // return response as HTML! 
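      // NOTE: no 'error' event handler is attached to this request, so a
      // network-level failure will crash the process; see the .on('error')
      // handler in lambda/http_request.js for a starting point.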
 43 |   });
 44 | 
 45 | }).end();
 46 | 
 47 | };
 48 | 
--------------------------------------------------------------------------------
/lib/index.js:
--------------------------------------------------------------------------------
 1 | module.exports = require('./switcher')
 2 | module.exports.issues_search = require('./issues_search') // easter egg
 3 | 
--------------------------------------------------------------------------------
/lib/issue.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * issue method scrapes a given GitHub issue page (title, state, labels, participants etc.)
 3 |  * @param {Object} $ - cheerio object with DOM of page to be scraped
 4 |  * @param {String} url - a valid GitHub issue url
 5 |  * @param {Function} callback - the callback we should call after scraping
 6 |  * a callback passed into this method should accept two parameters:
 7 |  * @param {Object} error an error object (set to null if no error occurred)
 8 |  * @param {Object} data - the complete issue contents + meta data
 9 |  */
10 | module.exports = function issue($, url, callback) {
11 | 
12 |   var data = { entries : [], labels : [], participants : [] };
13 |   data.url = url;
14 |   // console.log($('.gh-header-title'));
15 |   data.title = $('.gh-header-title').first().text().trim().split('\n')[0];
16 | 
17 |   data.state = $('.State').first().text().trim();
18 |   data.author = $('.gh-header-meta .author').first().text().trim();
19 |   data.created = $('relative-time')[0].attribs.datetime;
20 | 
21 |   // labels
22 |   $('.IssueLabel').each(function(){
23 |     data.labels.push($(this).attr('data-name'));
24 |   })
25 |   data.labels = data.labels.filter(function(i) { return i != null });
26 |   // stackoverflow.com/questions/9229645/remove-duplicates-from-js-array
27 |   data.labels = [...new Set(data.labels)]
28 | 
29 |   data.milestone = $('.Progress').next().text().trim();
30 |   data.assignee = $('.assignee').text().trim();
31 | 
32 |   // participants: anyone who has commented or been assigned in the issue
33 |   $('.participant-avatar').each(function(){
34 |     data.participants.push($(this).attr('href').replace('/',''));
35 |   })
36 |   // console.log(' - - - - - > data', data)
37 |   // NOTE: this is possibly the most messed up DOM structure ever!
38 |   // it's almost as if someone @GitHub is deliberately trying to prevent crawlers
39 | 
40 | 
41 |   var entries = $('.markdown-body');
42 |   console.log('entries.length', entries.length);
43 | 
44 |   const selector = '.markdown-body:nth-child(' + 1 + ')';
45 |   console.log('selector', selector);
46 | 
47 |   console.log($(selector).text().trim());
48 |   // console.log(entries[0]);
49 | 
50 |   for(var i=0; i < entries.length; i++) {
51 | 
52 |     // console.log(entries[i]);
53 |     // var id = entries[i].attribs.id; // see: http://git.io/vOC5d
54 |     // console.log(id);
55 |     // var entry = {"id":id};
56 |     // entry.author = $('#'+id+' .author').attr('href').replace('/','');
57 |     // entry.created = $('#'+id+' time').attr('datetime');
58 |     // entry.body = $('#'+id+' .comment-body').first().text().trim();
59 |     // data.entries.push(entry);
60 |   }
61 |   return callback(null, data);
62 | 
63 | }
--------------------------------------------------------------------------------
/lib/issues.js:
--------------------------------------------------------------------------------
 1 | /** UNCOMMENT THIS IF YOU HAVE TIME/PATIENCE TO MAKE IT WORK ...!
2 | * issue method scrapes a given GitHub repository's issues list 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub repository url in the format {user}/{project} 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {objectj} error an error object (set to null if no error occurred) 8 | * @param {object} data - list of (Public) GitHub issues (for the repo) 9 | 10 | module.exports = function issues ($, url, callback) { 11 | 12 | var data = { entries : [], url: url}; // the list we will return 13 | // meta data for the issues page 14 | var links = $('.table-list-header-toggle > a') 15 | // console.log(links); 16 | if(links.length === 0){ 17 | console.log(' - - - - - - short circuit (no links) - - - - - -') 18 | return callback(404); 19 | } 20 | data.open = parseInt(links['0'].children[2].data.trim().replace('Open', '').replace(/,/, ''), 10); 21 | data.closed = parseInt(links['1'].children[2].data.trim().replace('Open', '').replace(/,/, ''), 10); 22 | // extract all the issues on this page! 23 | var items = $('.table-list-item'); 24 | for(var i = 1; i < items.length + 1; i++) { 25 | var o = {}; // individual issue object 26 | var parent = '.table-list-item:nth-child(' +i +') '; 27 | o.url = $(parent + '.issue-title-link').first()['0'].attribs.href; 28 | o.title = $(parent + '.issue-title-link').first()['0'].children['0'].data.trim() 29 | o.created = $(parent + 'time')['0'].attribs.datetime 30 | o.author = $(parent + '.muted-link')['0'].children[0].data.trim(); 31 | o.comments = parseInt($(parent + '.issue-comments > a').first().text().trim(), 10); 32 | // assignee extraction only if assigned 33 | var img = $(parent + '.table-list-cell-avatar .tooltipped-n > img') 34 | if(img.length > 0) { 35 | o.assignee = img['0'].attribs.alt.replace('@','') 36 | } 37 | // milestone if one is set 38 | var milestone = $(parent + '.css-truncate-target'); 39 | if(milestone.length > 0) { 40 | o.milestone = milestone['0'].children[0].data.trim() 41 | } 42 | var labels = $(parent + '.labels > a'); 43 | var l = []; // only the label text! 44 | for(var j = 0; j < labels.length; j++) { 45 | l.push(labels[j].children[0].data.trim()); 46 | } 47 | o.labels = l; 48 | data.entries.push(o); 49 | } 50 | data = require('./next_page')($, data); // don't worry this gets cached ;-) 51 | return callback(null, data); 52 | 53 | } 54 | */ 55 | -------------------------------------------------------------------------------- /lib/issues_search.js: -------------------------------------------------------------------------------- 1 | // this will require using lib/http_request to be revived. 2 | var cheerio = require('cheerio'); 3 | 4 | var baseUrl = 'https://github.com'; 5 | var defaults = { 6 | "username" : "this", // username is kinda the point of the query! 7 | "query" : "author", // all issues created by the user (anywhere!) 8 | "state" : "open", // not too worried about the closed ones at first 9 | "order" : "desc", // newest first! 
10 |   "filter"   : "created"  // created date
11 | };
12 | 
13 | // function set_options(options) {
14 | //   var keys = Object.keys(defaults);
15 | //   keys.map(function(k){
16 | //     options[k] = options[k] || defaults[k];
17 | //   })
18 | //   return options;
19 | // }
20 | /**
21 |  * format: https://github.com/search?type=Issues&
22 |  *   q={query}%3A{username}&state={state}&o={order}&s={filter}
23 |  */
24 | // function set_url(options) {
25 | //   var url = baseUrl + '/search?type=Issues';
26 | //   url += '&q=' + options.query + '%3A' + options.username;
27 | //   url += '&state=' + options.state;
28 | //   url += '&o=' + options.order;
29 | //   url += '&s=' + options.filter;
30 | //   return url;
31 | // }
32 | 
33 | /** UNCOMMENT THIS IF YOU HAVE TIME/PATIENCE TO MAKE IT WORK ...!
34 |  * issues_search method scrapes a given GitHub repository's issues list
35 |  * @param {object} options - options for running the issue search
36 |  *   username - the GitHub username
37 |  *   query - 'mentions', 'assignee', 'author' or 'user' (defaults to author)
38 |  *   state - 'open' or 'closed' (defaults to open)
39 |  *   order - 'desc' or 'asc' descending / ascending respectively (default desc)
40 |  *   filter - 'created', 'updated', 'comments' (used in conjunction with order)
41 |  *   see: README/issues>search
42 |  * @param {function} callback - the callback we should call after scraping
43 |  * a callback passed into this method should accept two parameters:
44 |  * @param {object} error - an error object (set to null if no error occurred)
45 |  * @param {object} list - list of (Public) GitHub issues (for the repo)
46 | 
47 | module.exports = function issues_search (options, callback) {
48 | 
49 |   if(!callback || typeof options === 'function') {
50 |     callback = options;
51 |     return callback(400);
52 |   }
53 |   var url;
54 |   if(options.next) { // if we are parsing the next page of results!
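    // e.g: options.next === '/search?p=2&q=author%3Anelsonic&type=Issues'
    // (hypothetical value; it is scraped from the previous page's .next_page link below)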
55 | url = baseUrl + options.next; 56 | } 57 | else { 58 | options = set_options(options); // apply defaults for any unset keys 59 | url = set_url(options); // generate search url 60 | } 61 | console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - search url:') 62 | console.log(url); 63 | wreck.get(url, function (error, response, html) { 64 | var list = { entries : [] }; // the list we will return 65 | var $ = cheerio.load(html); 66 | // console.log(html.toString()); 67 | var items = $('.issue-list-item'); 68 | for(var i = 1; i < items.length; i++) { 69 | var o = {}; 70 | var parent = '.issue-list-item:nth-child(' +i +') '; 71 | var a = $(parent + '.title > a').first(); 72 | o.title = a.text()// ['0'].title; 73 | o.url = a['0'].attribs.href; 74 | var re = new RegExp('\n', 'g'); 75 | o.desc = $(parent + '.description').first().text().replace(re, '').trim(); 76 | o.author = $(parent + '.issue-list-meta > li:nth-child(2) > a')['0'].attribs.title; 77 | o.created = $(parent + '.issue-list-meta > li:nth-child(2) > time')['0'].attribs.datetime; 78 | var coms = $(parent + '.issue-list-meta > li:nth-child(3) > strong')['0']; 79 | if(coms) { 80 | o.comments = parseInt(coms.children[0].data, 10); 81 | } 82 | list.entries.push(o); 83 | } 84 | var next = $('.next_page') 85 | if(next.length > 0) { 86 | list.next = next['0'].attribs.href; 87 | } 88 | 89 | return callback(error, list); 90 | }); 91 | } 92 | */ 93 | -------------------------------------------------------------------------------- /lib/labels.js: -------------------------------------------------------------------------------- 1 | /** 2 | * labels method scrapes a given GitHub repository's list of labels 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub repository url {owner}/{reponame} 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {object} error - an error object (set to null if no error occurred) 8 | * @param {array} list - list of labels with colors for the GitHub repository 9 | */ 10 | function labels ($, url, callback) { 11 | var data = { entries: [], url: url }; 12 | var items = $('.table-list-item'); 13 | for(var i = 1; i < items.length; i++) { 14 | var parent = '.table-list-item:nth-child(' +i +') '; 15 | var link = $(parent + '.label-link')['0']; 16 | var label = { 17 | name : $(parent + '.label-name').first().text(), 18 | style : link.attribs.style.trim(), 19 | link : link.attribs.href, 20 | count : parseInt($(parent + '.label-description')['0'].children[0].data.replace('open issues', '').trim(), 10) 21 | } 22 | data.entries.push(label); 23 | } 24 | return callback(null, data); 25 | } 26 | 27 | module.exports = labels; 28 | -------------------------------------------------------------------------------- /lib/milestones.js: -------------------------------------------------------------------------------- 1 | /** 2 | * milestones method scrapes a given GitHub repository's list of milesontes 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} project - a valid GitHub repository name 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {object} error - an error object (set to null if no error occurred) 8 | * @param {array} list - list of milestones with colors for the GitHub repository 9 | */ 10 | 11 | 
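// minimal usage sketch; assumes you have already fetched the milestones page
// HTML yourself (e.g: for /dwyl/tudo/milestones) and loaded it with cheerio:
//   const cheerio = require('cheerio');
//   const $ = cheerio.load(html);
//   milestones($, '/dwyl/tudo/milestones', function (err, data) {
//     console.log(data.open, data.closed, data.entries);
//   });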
function milestones ($, url, callback) { 12 | var data = { entries : [], url: url}; 13 | // .states gives us the number of open vs. closed milestones 14 | var states = $('.states > a'); 15 | // console.log(states[0].children[2].data); 16 | data.open = parseInt(states[0].children[2].data.replace('Open','').trim(), 10); 17 | data.closed = parseInt(states[1].children[2].data.replace('Closed','').trim(), 10); 18 | 19 | $('.table-list-item').each(function (i) { 20 | var milestone = { 21 | name : $(this).find('.milestone-title-link').first().text().trim(), 22 | due : $(this).find('.milestone-meta-item:nth-child(1)').text().trim(), 23 | updated : $(this).find('.milestone-meta-item:nth-child(2)').text().trim(), 24 | desc : $(this).find('.milestone-description-html').first().text().trim(), 25 | progress : $(this).find('.progress-percent').first().text().trim(), 26 | 27 | open : parseInt($(this).find('.stat:nth-child(2)').first().text().replace('open','').trim(), 10), 28 | closed : parseInt($(this).find('.stat:nth-child(3)').first().text().replace('closed','').trim(), 10), 29 | } 30 | data.entries.push(milestone); 31 | }); 32 | return callback(null, data); 33 | } 34 | 35 | module.exports = milestones; 36 | -------------------------------------------------------------------------------- /lib/next_page.js: -------------------------------------------------------------------------------- 1 | /** 2 | * next_page checks for pagination on a page 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {Object} data - the data we have scraped from the page so far 5 | * @return {Object} the data object with a next_page key & value 6 | */ 7 | module.exports = function next_page ($, data) { 8 | var next = $('.paginate-container').find('a').last().attr('href'); 9 | if(next) { 10 | data.next_page = next 11 | } 12 | else { 13 | data.next_page = '' 14 | } 15 | 16 | return data; 17 | } 18 | -------------------------------------------------------------------------------- /lib/next_page_beta.js: -------------------------------------------------------------------------------- 1 | /** 2 | * next_page checks for pagination on a "beta" page ref #131 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {Object} data - the data we have scraped from the page so far 5 | * @return {Object} the data object with a next_page key & value 6 | */ 7 | module.exports = function next_page_beta ($, data) { 8 | const next = $('.TablePaginationSteps').find('[class^="Pagination__Page-"]').last().attr('href'); 9 | data.next_page = ''; 10 | /* istanbul ignore else */ 11 | if (next) { 12 | const url = data.url.split('?')[0]; 13 | data.next_page = url + '?type=all&' + 'page=' + next.replace('#', ''); 14 | } 15 | 16 | return data; 17 | } 18 | -------------------------------------------------------------------------------- /lib/org.js: -------------------------------------------------------------------------------- 1 | /** 2 | * org method scrapes a given GitHub organisation 3 | * @param {string} orgname - a valid GitHub orgname 4 | * @param {function} callback - the callback we should call after scraping 5 | * a callback passed into this method should accept two parameters: 6 | * @param {objectj} error an error object (set to null if no error occurred) 7 | * @param {object} data - the complete organsiation data 8 | */ 9 | function org($, url, callback) { 10 | var data = { url: url, type: 'org' }; 11 | data.name = $('h1.lh-condensed').first().text().trim(); 12 | // data.description = 
$('h1.lh-condensed').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯ 13 | data.description = $('.container-xl .color-fg-muted').first().text().trim() 14 | if($('span[itemprop=location]').length > 0){ 15 | data.location = $('span[itemprop=location]').first().text().trim(); 16 | } 17 | if($('.octicon-link').length > 0){ 18 | // console.log($('.octicon-link')); 19 | data.website = $('.octicon-link').next().text().trim(); 20 | } 21 | if($('a[itemprop=email]').length > 0){ 22 | data.email = $('a[itemprop=email]').first().text().trim(); 23 | } 24 | // var people = $('.Counter').eq(1); // people is *second* in list of tabs! 25 | // data.pcount = parseInt(people.first().text(), 10); 26 | // data.pcount = isNaN(data.pcount) ? 0 : data.pcount 27 | data.avatar = $('.avatar')[0].attribs.src; 28 | var parts = data.avatar.split('/'); 29 | data.uid = parseInt(parts[parts.length-1].split('?')[0], 10); 30 | // list of repos 31 | var items = $('li.Box-row'); 32 | // console.log('items.length', items.length); 33 | data.entries = []; // avoid having circular reference objects! :-( 34 | items.each( function (i) { // JS counters start at 0. 35 | var parent = 'li.Box-row:nth-child(' + (i+1) +') '; // CSS selectors start at 1. 36 | // console.log($(parent)) 37 | data.entries.push({ 38 | name: $(parent + ' a').first().text().trim(), 39 | lang: $(parent + 'span[itemprop=programmingLanguage]').first().text().trim(), 40 | url: $(parent + ' a').first().attr('href'), 41 | description: $(parent + 'p[itemprop=description]').first().text().trim(), 42 | updated: $(parent + ' relative-time')[0].attribs.datetime 43 | }); 44 | }); 45 | 46 | data = require('./next_page')($, data); // don't worry this gets cached ;-) 47 | callback(null, data); 48 | } 49 | 50 | module.exports = org 51 | -------------------------------------------------------------------------------- /lib/org_repos.js: -------------------------------------------------------------------------------- 1 | /** 2 | * `org_repos` parses a given GitHub organization repositories page. 3 | * e.g: https://github.com/orgs/dwyl/repositories?type=all 4 | * @param {object} $ - the cheerio DOM object. 5 | * @param {string} url - the url of the page to be parsed. 6 | * @param {function} callback - the callback we should call after scraping 7 | * a callback passed into this method should accept two parameters: 8 | * @param {objectj} error an error object (set to null if no error occurred) 9 | * @param {object} data - the complete organsiation data 10 | */ 11 | function org_repos($, url, callback) { 12 | var data = { url: url, type: 'org_repos' }; 13 | data.name = $('h1.lh-condensed').first().text().trim(); 14 | // data.description = $('h1.lh-condensed').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯ 15 | data.description = $('.container-xl .color-fg-muted').first().text().trim() 16 | // var people = $('.Counter').eq(1); // people is *second* in list of tabs! 17 | // data.pcount = parseInt(people.first().text(), 10); 18 | // data.pcount = isNaN(data.pcount) ? 0 : data.pcount 19 | data.avatar = $('.avatar')[0].attribs.src; 20 | var parts = data.avatar.split('/'); 21 | data.uid = parseInt(parts[parts.length-1].split('?')[0], 10); 22 | // list of repos 23 | var items = $('li.listviewitem'); 24 | // console.log('items.length', items.length); 25 | data.entries = []; // avoid having circular reference objects! :-( 26 | items.each( function (i) { // JS counters start at 0. 27 | // console.log(i) 28 | var parent = 'li:nth-child(' + (i+1) +') '; // CSS selectors start at 1. 
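    // e.g: when i === 2, parent === 'li:nth-child(3) ' (the third repo row)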
29 |     // console.log($(parent)) // (debug) uncomment to inspect each row
30 |     // console.log($(parent + ' .markdown-title'))
31 |     data.entries.push({
32 |       // feel free to add more attributes to this! 🙏
33 |       name: $(parent + ' .markdown-title').text().trim(),
34 |       // lang: $(parent + ' .listview-item-main-content').find('[class^="Text-"]').text().trim(),
35 |       url: $(parent + ' a').first().attr('href'),
36 |       description: $(parent + ' .repos-list-description').first().text().trim(),
37 |       // updated: $(parent + ' relative-time')[0].attribs.datetime
38 |     });
39 |   });
40 |   // console.log(data)
41 | 
42 |   data = require('./next_page_beta')($, data); // don't worry this gets cached ;-)
43 |   callback(null, data);
44 | }
45 | 
46 | module.exports = org_repos
47 | 
--------------------------------------------------------------------------------
/lib/people.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * people method scrapes a given GitHub organisation's PUBLIC People page
 3 |  * @param {Object} $ - cheerio object with DOM of page to be scraped
 4 |  * @param {string} url - a valid GitHub org people url e.g: /orgs/{org}/people
 5 |  * @param {function} callback - the callback we should call after scraping
 6 |  * a callback passed into this method should accept two parameters:
 7 |  * @param {object} error - an error object (set to null if no error occurred)
 8 |  * @param {object} data - list of People who have made their membership Public for the org
 9 |  */
10 | module.exports = function people ($, url, callback) {
11 |   var data = { entries : [], url: url, type: 'people' };
12 | 
13 |   $('#org-members-table img.avatar').each(function (i, el) {
14 |     var src = el.attribs.src;
15 |     var parts = src.split('/');
16 |     var uid = parseInt(parts[parts.length-1].split('?')[0], 10);
17 |     data.entries.push({
18 |       avatar: src,
19 |       uid: uid,
20 |       username: el.attribs.alt.replace('@', '')
21 |     });
22 |   });
23 | 
24 |   data = require('./next_page')($, data); // don't worry this gets cached ;-)
25 |   return callback(null, data);
26 | }
--------------------------------------------------------------------------------
/lib/profile.js:
--------------------------------------------------------------------------------
 1 | 
 2 | const selectors = require('../config/repos')
 3 | /**
 4 |  * profile method scrapes a given GitHub user profile
 5 |  * @param {Object} $ - cheerio object with DOM of the profile page; url e.g: /{username}
 6 |  * @param {function} callback - the callback we should call after scraping
 7 |  * a callback passed into this method should accept two parameters:
 8 |  * @param {object} error - an error object (set to null if no error occurred)
 9 |  * @param {object} data - the complete GitHub Profile for the username
10 |  */
11 | module.exports = function profile ($, url, callback) {
12 |   // console.log($(`${selectors.PROFILE}`).first().find('.user-profile-bio').text()) // (debug)
13 |   var data = { url: url, type: 'profile' };
14 |   // const tmpData=[] // (unused)
15 |   // const stats=[]   // (unused)
16 |   data.username = url.replace('/', '');
17 |   data.bio = $(`${selectors.PROFILE}`).first().find('.user-profile-bio').text();
18 |   data.avatar = $(`${selectors.PROFILE}`).first().find('.avatar-user').first().attr('src'); // avatar-user
19 |   var parts = data.avatar.split('/');
20 |   data.uid = parseInt(parts[parts.length-1].split('?')[0], 10);
21 | 
22 |   data.repos = k_to_int($('.UnderlineNav .octicon-repo').first().next().text().trim());
23 |   data.projects = k_to_int($('.octicon-table').first().next().text().trim());
24 |   data.stars = k_to_int($('.octicon-star').next().text().trim()); // number of repositories user has starred
25 |   data.followers =
k_to_int($('.js-profile-editable-area .color-fg-default').first().text().trim()); 26 | data.following = k_to_int($('.js-profile-editable-area .color-fg-default').eq(1).text().trim()); 27 | 28 | // Pinned Repos 29 | 30 | var repos = $('.pinned-item-list-item') 31 | 32 | // console.log('repos: ', repos); 33 | data.pinned = []; 34 | repos.each(function (i) { 35 | data.pinned.push({ 36 | url: $(this).find('a.text-bold')[0]['attribs']['href'], 37 | // Want More? see: https://github.com/nelsonic/github-scraper/issues/78 38 | }) 39 | }); 40 | data.name = $('.vcard-fullname').text().trim(); // Full Name 41 | data.worksfor = $('.p-org').first().text().trim(); // Works for 42 | const location = $('li[itemprop=homeLocation]') 43 | if(location && location.attr('aria-label')) { 44 | data.location = location.attr('aria-label').replace("Home location: ", ''); 45 | } 46 | data.website = $('[data-test-selector=profile-website-url] > a').attr("href") 47 | // data.joined = $('.join-date').attr('datetime'); // Joined GitHub 48 | 49 | // Contributions to Open Source in the past 12 months #132 50 | // data.contribs = parseInt($('.js-yearly-contributions h2').text().trim() 51 | // .split(' contributions')[0].replace(',', ''), 10); 52 | // Contribution Matrix 53 | // data = require('./profile_contribs.js')($, data); 54 | 55 | // List of (Public) organizations from profile 56 | // data-hovercard-type="organization" 57 | var orgs = $('.avatar-group-item'); 58 | // console.log(orgs); 59 | data.orgs = {}; // https://github.com/nelsonic/github-scraper/issues/80 60 | orgs.each( function (i) { 61 | var url = orgs[i].attribs.href.replace('/', ''); 62 | data.orgs[url] = $(this).find('img')['0'].attribs.src; // org image 63 | }) 64 | 65 | // GitHub Developer Program member? 66 | var member = $('.octicon-cpu').parent().text().trim(); 67 | // yes this is always on the page but the hide it using CSS!! :-( 68 | var display = $('.bg-purple').parent().hasClass('d-none'); 69 | if(member && !display) { 70 | data.developerprogram = true; 71 | } 72 | callback(null, data); 73 | // add task to arana to scrape /{username}?tab=repositories after profile! 
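  // at this point `data` contains: username, bio, avatar, uid, repos, projects,
  // stars, followers, following, pinned, name, worksfor, website and orgs
  // (plus location / developerprogram when present on the profile page)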
74 | } 75 | 76 | // transform '3.4k' to 3400 77 | function k_to_int(val) { 78 | // if (val === undefined) { 79 | // return 0; 80 | // } 81 | if (val.indexOf("k") > -1) { 82 | val = val.split("k")[0]; 83 | val = parseFloat(val); 84 | val = val * 1000; 85 | } 86 | val = parseInt(val); 87 | return Math.floor(val) 88 | } -------------------------------------------------------------------------------- /lib/profile_contribs.js: -------------------------------------------------------------------------------- 1 | /** 2 | * profile_contribs scrapes a user's GitHub Contribution Matrix 3 | * @param {Object} $ - a valid GitHub username 4 | * @param {Object} data - the complete GitHub Profile for the username 5 | * @returns {object} data - the complete GitHub Profile for the username 6 | */ 7 | module.exports = function profile($, data) { 8 | console.log(data) 9 | var c = $('.ContributionCalendar-day'); 10 | var matrix = {}; 11 | for(var i = 0; i < c.length; i++) { 12 | var e = c[i].attribs; // the entry 13 | 14 | var id = e.id.replace('contribution-day-component-','') 15 | // console.log(e.id, id) 16 | if (e['data-date']) { 17 | matrix[e['data-date']] = { 18 | fill: e['fill'], 19 | count: parseInt(e['data-count'], 10), 20 | x: e['data-ix'], 21 | y: id.split('-')[0] 22 | } 23 | } 24 | } 25 | // console.log(matrix) 26 | data.contrib_matrix = matrix; 27 | return data; 28 | } 29 | -------------------------------------------------------------------------------- /lib/repo.js: -------------------------------------------------------------------------------- 1 | const parse_int = require('../lib/utils').parse_int; 2 | const selectors=require('../config/repos') 3 | 4 | /** 5 | * repo method scrapes a given GitHub repository page 6 | * @param {Object} $ - cheerio object with DOM of page to be scraped 7 | * @param {string} project - a valid GitHub repository name 8 | * @param {function} callback - the callback we should call after scraping 9 | * a callback passed into this method should accept two parameters: 10 | * @param {object} error - an error object (set to null if no error occurred) 11 | * @param {array} data - list of (Public) information on the GitHub repository 12 | */ 13 | function repo ($, url, callback) { 14 | 15 | var data = { "url" : url, type: 'repo'}; 16 | data.description = $('.Layout-sidebar .f4').first().text().trim(); 17 | data.website =$('.Layout-sidebar .octicon-link').parent().text().trim(); 18 | 19 | var badges = $('.social-count'); 20 | var forkedfrom = $(`${selectors.FORKED_FROM}`).text(); 21 | if (forkedfrom) { 22 | 23 | data.forkedfrom = forkedfrom; 24 | } 25 | 26 | data.tags = [] 27 | $(`${selectors.TOPIC_TAG}`) 28 | .each(function(i,a){ 29 | data.tags.push($(this).text().trim()) 30 | }) 31 | 32 | data.usedby = parse_int($('.hx_flex-avatar-stack').next().text().trim()); 33 | data.watchers = parse_int(strip($('.octicon-eye').parent().text().trim())); 34 | data.stars = parse_int(strip($('.Layout-sidebar .octicon-star').parent().text().trim())); 35 | data.forks = parse_int(strip($('.Layout-sidebar .octicon-repo-forked').parent().text().trim())); 36 | // Commits are now client-side rendered by React. 🤦‍♂️ 37 | // data.commits = parse_int($('.octicon-history').parent().text().trim()); 38 | // Branches failing ... 
https://github.com/nelsonic/github-scraper/issues/126 39 | // console.log($('.Layout-main .octicon-git-branch')) 40 | // data.branches = parse_int($('.Layout-main .octicon-git-branch').parent().next().text()); 41 | // data.releases = parse_int($('.octicon-tag').next().text()); 42 | 43 | data.langs = []; // languages used in the repo: 44 | $('.Layout-sidebar .list-style-none').last().find("a") 45 | .each(function(i,e){ 46 | data.langs.push({ 47 | name:$(this).find('span').first().text(), 48 | perc:$(this).find('span').last().text() 49 | }) 50 | 51 | }) 52 | // console.log(data) 53 | return callback(null, data) 54 | } 55 | 56 | module.exports = repo; 57 | 58 | function strip(str) { 59 | return str.split('\n')[0] 60 | } -------------------------------------------------------------------------------- /lib/repos.js: -------------------------------------------------------------------------------- 1 | /** 2 | * repo method scrapes a given GitHub user's repositories tab 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub username 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {object} error an error object (set to null if no error occurred) 8 | * @param {object} data - list of (Public) GitHub repositories (for the user) 9 | */ 10 | module.exports = function repos ($, url, callback) { 11 | var data = { entries: [], url:url}; // store repos in array 12 | var items = $('.repo-list-item'); 13 | for(var i = 1; i < items.length; i++) { 14 | var r = {}; 15 | var parent = '.repo-list-item:nth-child(' +i +') '; 16 | var a = $(parent + '.repo-list-name > a').first()['0'] 17 | r.url = a.attribs.href; 18 | r.name = a.children[0].data.trim(); 19 | // see: http://stackoverflow.com/questions/7969414/ (find element by itemprop) 20 | r.lang = $(parent + '.repo-list-stats > span[itemprop="programmingLanguage"]').first().text().trim() 21 | r.desc = $(parent + '.repo-list-description').first().text().trim() 22 | r.info = $(parent + '.repo-list-info').first().text().trim() || '' 23 | r.stars = parseInt($(parent + '.octicon-star').parent().first().text().trim(), 10) 24 | r.forks = $(parent + '.octicon-git-branch').parent().first().text().trim() 25 | r.updated = $(parent + ' .repo-list-meta > time')[0].attribs.datetime 26 | data.entries.push(r); 27 | } 28 | return callback(null, data) 29 | } 30 | -------------------------------------------------------------------------------- /lib/repos_user.js: -------------------------------------------------------------------------------- 1 | /** 2 | * repo method scrapes a given GitHub user's repositories tab 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub username 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {object} error an error object (set to null if no error occurred) 8 | * @param {object} data - list of (Public) GitHub repositories (for the user) 9 | */ 10 | module.exports = function repos ($, url, callback) { 11 | var data = { entries: [], url:url}; // store repos in array 12 | var items = $('.source'); 13 | // console.log('items.length:', items.length) 14 | for(var i = 1; i < items.length; i++) { 15 | var r = {}; 16 | var parent = '.source:nth-child(' + i +') '; 17 | // console.log(parent) 18 | var a = $('.wb-break-all 
> a', parent) 19 | if(a && a.length > 0) { 20 | a = a['0'] 21 | r.url = a.attribs.href; 22 | r.name = a.children[0].data.trim(); 23 | } 24 | // see: http://stackoverflow.com/questions/7969414/ (find element by itemprop) 25 | var lang = $(parent + 'span[itemprop="programmingLanguage"]'); 26 | 27 | if(lang && lang.length > 0) { 28 | r.lang = lang['0'].children[0].data 29 | } 30 | r.desc = $(parent + '.repo-list-description').first().text().trim() 31 | r.info = $(parent + '.repo-list-info').first().text().trim() || '' 32 | r.stars = parseInt($(parent + '.octicon-star').parent().first().text().trim(), 10) 33 | r.forks = $(parent + '.octicon-git-branch').parent().first().text().trim() 34 | var updated = $(parent + ' relative-time'); 35 | if (updated && updated.length > 0) { 36 | r.updated = updated['0'].attribs.datetime 37 | } 38 | 39 | data.entries.push(r); 40 | } 41 | return callback(null, data) 42 | } 43 | -------------------------------------------------------------------------------- /lib/scrapers.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // feed: require('./feed'), // activity feed (RSS) 3 | followers: require('./followers'), // also scrapes following or stargazers 4 | issue: require('./issue'), 5 | // issues: require('./issues'), 6 | // issues_search: require('./issues_search'), 7 | // labels : require('./labels'), 8 | // milestones : require('./milestones'), 9 | org: require('./org'), 10 | org_repos: require('./org_repos'), 11 | people: require('./people'), 12 | profile: require('./profile'), 13 | repo: require('./repo'), 14 | // repos: require('./repos'), 15 | repos_user: require('./repos_user'), 16 | // starred: require('./starred') 17 | stars_watchers: require('./stars_watchers') 18 | } 19 | -------------------------------------------------------------------------------- /lib/starred.js: -------------------------------------------------------------------------------- 1 | /** 2 | * starred method scrapes a given GitHub user's starred repos list 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub username 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {object} error an error object (set to null if no error occurred) 8 | * @param {object} data - list of (Public) GitHub repositories stared by user 9 | */ 10 | module.exports = function starred ($, url, callback) { 11 | var data = { entries : [], url: url }; 12 | $('.repo-list-name').each(function () { 13 | data.entries.push($(this).find('a').attr('href')); 14 | }); 15 | data = require('./next_page')($, data); // don't worry this gets cached ;-) 16 | return callback(null, data); 17 | } 18 | -------------------------------------------------------------------------------- /lib/stars_watchers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * followers method parses a given GitHub user's followers/following/stars list 3 | * @param {Object} $ - cheerio object with DOM of page to be scraped 4 | * @param {string} url - a valid GitHub username or url e.g: /{username} 5 | * @param {function} callback - the callback we should call after scraping 6 | * a callback passed into this method should accept two parameters: 7 | * @param {objectj} error an error object (set to null if no error occurred) 8 | * @param {object} data - list of (Public) GitHub repositories 
(for the user)
 9 |  */
10 | module.exports = function stargazers_watchers ($, url, callback) {
11 |   var data = { entries : [], url: url, type: 'stars' };
12 |   data.stars = $('.tabnav .Counter').text().trim()
13 | 
14 |   $('.list-style-none img.avatar').each(function (i, el) {
15 |     var src = el.attribs.src;
16 |     var parts = src.split('/');
17 |     var uid = parseInt(parts[parts.length-1].split('?')[0], 10);
18 |     data.entries.push({
19 |       avatar: src,
20 |       uid: uid,
21 |       username: el.attribs.alt.replace('@', '')
22 |     });
23 |   });
24 | 
25 |   data = require('./next_page')($, data); // don't worry this gets cached ;-)
26 |   callback(null, data)
27 | }
28 | 
--------------------------------------------------------------------------------
/lib/switcher.js:
--------------------------------------------------------------------------------
  1 | var http_request = require('./http_request');
  2 | var cheerio = require('cheerio');
  3 | var validate = require('./url_validator');
  4 | var scrapers = require('./scrapers');
  5 | 
  6 | // Adding Colors to Terminal *Without* a Library/Module
  7 | var bgRedBlack = '\x1b[41m\x1b[30m';
  8 | var bgGreenBlack = '\x1b[42m\x1b[30m';
  9 | var RESET = '\x1b[0m'; // see: https://stackoverflow.com/a/41407246/1148249
 10 | /**
 11 |  * switcher is the brains of this module!
 12 |  * it decides which scraper to use for a given url
 13 |  * @param {string} url - a valid GitHub username or url
 14 |  * @param {function} callback - the callback we should call after scraping
 15 |  * a callback passed into this method should accept two parameters:
 16 |  * @param {object} error - an error object (set to null if no error occurred)
 17 |  * @param {object} data - the scraped data for the given url
 18 |  */
 19 | module.exports = function switcher (url, callback) {
 20 | 
 21 |   if(!callback || typeof callback !== 'function') {
 22 |     var msg = "GitHub Scraper is Asynchronous, a callback function is required!\n"
 23 |     msg += '\n - - - - - - - - - - - - - - - called with url: '
 24 |     msg += url
 25 |     msg += ' - - - - - - - - - - - - - - - \n'
 26 |     throw new Error(msg); // (arguments.callee is forbidden in strict mode)
 27 |   }
 28 | 
 29 |   var scraper; // the method we will use below
 30 |   if(!url || typeof url === 'undefined'){
 31 |     return callback(404);
 32 |   }
 33 |   url = validate(url, callback); // ensure we 404 on undefined url
 34 |   // console.log('\n- - - - - - - - - - - - - - - - - - - - - - - - - - URL:')
 35 |   // console.log(url);
 36 |   // console.log('- - - - - - - - - - - - - - - - - - - - - - - - - - - - \n')
 37 |   // if(url.match(/\.atom/)) { // feed parser has its own request handler
 38 |   //   return scrapers['feed'](url, callback);
 39 |   // }
 40 |   // centralised request issuer/handler
 41 |   http_request(url, function (status, html) {
 42 |     if (status !== 200 || !html) {
 43 |       console.log(bgRedBlack,
 44 |         " - - - GitHub Scraper SWITCHER FAIL >> " + url + " - - - ", RESET);
 45 |       // console.log(error, response.headers);
 46 |       // console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - -')
 47 |       callback(status);
 48 |     }
 49 |     else {
 50 |       var $ = cheerio.load(html);
 51 | 
 52 |       // in the case of username or orgname urls (which have exactly the same format!)
 53 |       // we need to fetch the page before we can tell which scraper to use
 54 |       if(url.match(/tab=repositories/)) {
 55 |         console.log('repos_user - - - - - - - - -')
 56 |         scraper = 'repos_user';
 57 |       }
 58 |       // e.g: https://github.com/orgs/dwyl/repositories?type=all
 59 |       else if(url.match(/org/) && url.match(/repositories/)) {
 60 |         scraper = 'org_repos';
 61 |       }
 62 |       else if(url.match(/followers|following/)) {
 63 |         scraper = 'followers'; // html/DOM is identical for these 2 pages!
 64 |       }
 65 |       else if(url.match(/stargazers|watchers/)) {
 66 |         scraper = 'stars_watchers'; // html/DOM is identical for these 2 pages!
 67 |       }
 68 |       else if(url.match(/people/) && $($('.octicon-person'))) { // org people
 69 |         scraper = 'people';
 70 |       }
 71 |       else if($('.orghead').length > 0){
 72 |         scraper = 'org';
 73 |       }
 74 |       else if($('.h-card').length > 0) {
 75 |         // console.log('PROFILE!!')
 76 |         scraper = 'profile';
 77 |       }
 78 |       // else if(url.match(/stars/)) {
 79 |       //   scraper = 'starred';
 80 |       // }
 81 |       // else if($('.commits').length > 0) {
 82 |       else {
 83 |         scraper = 'repo';
 84 |       }
 85 |       // else if(url.match(/milestones/)) {
 86 |       //   scraper = 'milestones';
 87 |       // }
 88 |       // else if(url.match(/labels/)) {
 89 |       //   scraper = 'labels';
 90 |       // }
 91 |       if($('.issue').length > 0) {
 92 |         scraper = 'issue';
 93 |       }
 94 | 
 95 |       // else { // else if(url.match(/issues/)) {
 96 |       //   scraper = 'issues';
 97 |       // }
 98 |       console.log(bgGreenBlack,
 99 |         " - - - GitHub Scraper >> "+url +" >> "+scraper + " - - - ", RESET);
100 |       return scrapers[scraper]($, url, callback)
101 |     }
102 |   });
103 | }
--------------------------------------------------------------------------------
/lib/url_validator.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * url_validator does exactly what its name suggests: validates a url
 3 |  * @param {String} url - a (hopefully) valid GitHub url
 4 |  * @param {Function} callback - used to return a 400 error if the url is missing
 5 |  * @returns {String} - returns the validated url
 6 |  */
 7 | module.exports = function validator (url, callback) {
 8 |   // console.log('\n- - - - - - - - - - - - - - - - - - - - - - - - - - URL:')
 9 |   // console.log(url);
10 |   // check if the url was set
11 |   if(!url || url.length === 0 || typeof url === 'undefined'){
12 |     return callback(400);
13 |   }
14 | 
15 |   // add preceding forward slash if not present
16 |   if(url.charAt(0) !== '/' && url.indexOf('http') === -1) {
17 |     url = '/' + url;
18 |   }
19 |   // strip github.com from url
20 |   if(url.indexOf('github.com') !== -1) { // e.g: https://github.com/orgs/github/people?page=2
21 |     url = url.split('https://github.com')[1];
22 |   } // eg: https://github.com/orgs/dwyl/people
23 |   return url;
24 | }
--------------------------------------------------------------------------------
/lib/utils.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * `parse_int` parses a String e.g: 1.2k and returns an Int 1200
 3 |  * @param {String} str - the string to be parsed. e.g: "14.7k"
 4 |  * @return {Number} int - the integer representation of the String.
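 * @example
 * // parse_int("14.7k") // => 14700
 * // parse_int("1.2m")  // => 1200000
 * // parse_int("329")   // => 329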
5 | */ 6 | function parse_int (str) { 7 | if (!str) { 8 | return 0; 9 | } 10 | 11 | return parseInt( 12 | str 13 | .trim() 14 | .replace(/\.(\d)k$/, "$100") // $1 match the digit \d 15 | .replace(/k$/, "000") 16 | .replace(/\.(\d)m$/, "$100000") // $1 match the digit \d 17 | .replace(/m$/, "000000") 18 | .replace(/[^0-9]/g, '') 19 | , 10) 20 | } 21 | 22 | module.exports = { 23 | parse_int: parse_int 24 | } 25 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "github-scraper", 3 | "version": "7.1.1", 4 | "description": "Parse data from GitHub Profiles, Repos and Orgs", 5 | "main": "lib/index.js", 6 | "scripts": { 7 | "dev": "nodemon test/followers.js", 8 | "start": "live-server --open=./examples", 9 | "quick": "./node_modules/tape/bin/tape ./test/*.js", 10 | "test": "./node_modules/.bin/istanbul cover ./node_modules/tape/bin/tape ./test/*.js | node_modules/tap-spec/bin/cmd.js", 11 | "coverage": "./node_modules/.bin/istanbul cover ./node_modules/tape/bin/tape ./test/*.js && istanbul check-coverage --lines 100 --branches 100", 12 | "deploy": "dpl" 13 | }, 14 | "repository": { 15 | "type": "git", 16 | "url": "https://github.com/nelsonic/github-scraper.git" 17 | }, 18 | "author": "Dyler Turden", 19 | "license": "ISC", 20 | "bugs": { 21 | "url": "https://github.com/nelsonic/github-scraper/issues" 22 | }, 23 | "homepage": "https://github.com/nelsonic/github-scraper", 24 | "engines": { 25 | "node": ">= 10" 26 | }, 27 | "dependencies": { 28 | "aws-sdk": "^2.1692.0", 29 | "cheerio": "^1.0.0", 30 | "env2": "^2.2.2", 31 | "github-scraper": "^7.1.1" 32 | }, 33 | "devDependencies": { 34 | "dpl": "^5.0.1", 35 | "istanbul": "^0.4.5", 36 | "jshint": "^2.11.0", 37 | "live-server": "^1.2.1", 38 | "mkdirp": "^3.0.1", 39 | "pre-commit": "1.2.2", 40 | "tap-spec": "^5.0.0", 41 | "tape": "^5.9.0" 42 | }, 43 | "pre-commit": [ 44 | "coverage" 45 | ], 46 | "files_to_deploy": [ 47 | "index.js", 48 | "package.json", 49 | "lambda/", 50 | "lib/", 51 | ".env" 52 | ], 53 | "lambda_memory": 256, 54 | "lambda_timeout": 42 55 | } 56 | -------------------------------------------------------------------------------- /test/e2e.test.js: -------------------------------------------------------------------------------- 1 | var gs = require('../lib/'); 2 | var test = require('tape'); 3 | 4 | test('Scrape a known PROFILE @alanshaw', function(t){ 5 | var user = 'alanshaw'; 6 | gs(user, function(err, data) { 7 | t.ok(data.developerprogram === true, '- @' + user + ' is a member of the "GitHub Developer Program"'); 8 | t.ok(data.followers > 100, '- @' + user + ' Has more than 100 followers'); 9 | t.ok(data.stars > 100, '- @' + user + ' Has starred more than 100 repos'); 10 | t.end() 11 | }) 12 | }) 13 | 14 | test('FOLLOWERS LIST for @iteles', function(t){ 15 | var url = 'iteles?tab=followers'; 16 | gs(url, function(err, data) { 17 | t.ok(data.entries.length > 42, url +' count: '+data.entries.length); 18 | t.end(); 19 | }) 20 | }) 21 | 22 | test('FOLLOWING LIST (SECOND PAGE) for @nelsonic', function(t){ 23 | var url = 'nelsonic?page=2&tab=following'; 24 | gs(url, function(err, data) { 25 | t.ok(data.entries.length > 10, url +' count: '+data.entries.length); 26 | t.end(); 27 | }) 28 | }) 29 | 30 | test.skip('STARRED repos for @iteles (multi-page)', function(t){ 31 | var username = 'stars/iteles'; 32 | gs(username, function(err, data) { 33 | // t.ok(data.repos.length === 20, 'first page of org 
has 20 repos: '+data.repos.length) 34 | t.ok(data.entries.length === 30, '@'+username +' has only "starred": '+data.entries.length +' repos (first page)'); 35 | t.ok(data.next_page.indexOf('page=2') > -1, '@'+username +' has multiple pages of starred repos'); 36 | gs(data.next_page, function(err2, data2){ 37 | console.log(data2.next_page) 38 | console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ') 39 | console.log(data2); 40 | t.ok(data2.next_page.indexOf('page=3') > -1, '@'+username +' has multiple pages of starred repos'); 41 | t.end(); 42 | }) 43 | }); 44 | }) 45 | 46 | test.skip('parse @iteles activity feed (expect recent activity)', function(t){ 47 | var user = 'iteles.atom'; 48 | gs(user, function(err, data) { 49 | t.ok(err === null, 'No error when parsing @' +user +' activity feed'); 50 | var entry = data.entries.filter(function(e){ 51 | return e.indexOf('commented'); 52 | }) 53 | t.ok(data.entries.length === 30, '@' +user +' activity feed contains 30 entries') 54 | t.end(); 55 | }) 56 | }); 57 | 58 | test.skip('Find the repo with most stars for a given user', function(t) { 59 | var user = 'iteles?tab=repositories'; 60 | gs(user, function(err, data) { 61 | console.log(data) 62 | console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ') 63 | data.entries.sort(function(a,b) { 64 | return b.stars - a.stars ; 65 | }); 66 | var repo = data.entries[0] 67 | t.ok(repo.stars > 42, '@' + user +' > ' +repo.name +' has ' + repo.stars +' stars!'); 68 | t.end(); 69 | }) 70 | }); 71 | 72 | 73 | test.skip('Scrape an ORG WITH a next_page of repositories (known data)', function(t){ 74 | var url = 'dwyl'; 75 | gs(url, function(err, data) { 76 | console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ') 77 | console.log(data); 78 | t.ok(data.entries.length > 19, 'org '+url + ' has ' +data.entries.length + ' repos.') 79 | t.ok(data.next_page === '/dwyl?page=2', 'dwyl has more than one page'); 80 | t.ok(data.pcount > 20 , '"pcount":'+data.pcount + ' (people in the company)'); 81 | t.end(); 82 | }); 83 | }); 84 | 85 | 86 | test.skip('find issue with most comments', function(t){ 87 | var project = '/dwyl/tudo/issues' 88 | gs(project, function(err, data) { 89 | t.ok(err === null, 'No Error when crawling ' +project +' issues'); 90 | 91 | data.entries.sort(function(a,b) { 92 | return b.comments - a.comments 93 | }) 94 | var issue = data.entries[0]; 95 | console.log('- - - - - - - - - - - issue with most comments in '+project) 96 | // console.log(issue) 97 | t.ok(issue.comments > 2, issue.title + ' has ' + issue.comments + ' comments!') 98 | t.ok(data.open > 5, 'repo: ' +project +' has ' + data.count + ' issues (ZERO)'); 99 | t.ok(data.closed > 5, 'repo: ' +project +' has ' +data.closed + ' CLOSED issues'); 100 | t.end(); 101 | }) 102 | }) 103 | 104 | test.skip('Crawl a REPOSITORY single language repo', function(t){ 105 | var project = 'nelsonic/practical-js-tdd'; 106 | gs(project, function(err, data) { 107 | console.log(data); 108 | t.ok(data.langs[0].indexOf('JavaScript') > -1, 'Language is: '+ data.langs) 109 | t.end(); 110 | }) 111 | }) 112 | 113 | test.skip('LABELS for dwyl/tudo/labels', function(t){ 114 | var url = '/dwyl/time/labels'; 115 | gs(url, function(err, list) { 116 | console.log(list); 117 | t.ok(err === null, 'No Error when crawling ' + url +' (repo pages)'); 118 | var question = list.entries.filter(function(item){ 119 | return item.name === 'question'; 120 | }) 121 | question = question[0]; 122 | t.ok(question.link === 
url+'/question', 'question.link is : '+question.link+ ' === ' +url+'/question');
123 |     t.ok(question.count > 1, 'question.count (number of open issues): '+question.count);
124 |     t.ok(question.style.indexOf('#fff') > -1, 'question.styles are '+question.style);
125 |     t.end();
126 |   })
127 | })
128 | 
129 | test.skip('MILESTONES for /dwyl/tudo/milestones', function(t){
130 |   var url = '/dwyl/tudo/milestones';
131 |   gs(url, function(err, data) {
132 |     console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
133 |     console.log(data);
134 |     t.ok(err === null, 'No Error when crawling ' + url +' (repo pages)');
135 |     t.ok(data.open > 0, 'data.open '+data.open);
136 |     t.ok(data.closed > 0, 'data.closed '+data.closed);
137 |     t.end();
138 |   })
139 | })
140 | 
141 | test.skip('ISSUE contents without milestone', function(t){
142 |   var url = '/dwyl/time/issues/154';
143 |   gs(url, function(err, data){
144 |     console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
145 |     console.log(data);
146 |     var d = data.entries.filter(function(item){
147 |       return item.id === 'issuecomment-104228711';
148 |     })
149 |     d = d[0] // there should only be one entry
150 |     t.ok(data.state === 'Closed', url +' state is: ' + data.state)
151 | 
152 |     t.ok(d.body === 'I Love you!', url +' last comment is: - - - - - - - - > '+d.body);
153 |     t.end()
154 |   });
155 | })
156 | 
157 | test.skip('ORG PEOPLE ', function(t){
158 |   var url = 'orgs/dwyl/people';
159 |   gs(url, function(err, data){
160 |     console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dwyl people:')
161 |     console.log(data.entries.sort().join(', '));
162 | 
163 |     t.ok(data.entries.indexOf('iteles') > -1, url +' has '+data.entries.length + ' people');
164 |     t.end()
165 |   });
166 | })
167 | 
--------------------------------------------------------------------------------
/test/feed.test.js:
--------------------------------------------------------------------------------
1 | var test = require('tape');
2 | var feed = require('../lib/switcher'); // required so the skipped tests below run if re-enabled
3 | 
4 | test.skip('parse @iteles activity feed (expect recent activity)', function(t){
5 |   var user = 'iteles.atom';
6 |   feed(user, function(err, data){
7 |     t.ok(err === null, 'No error when parsing @' +user +' activity feed');
8 |     var entry = data.entries.filter(function(e){
9 |       return e.indexOf('commented');
10 |     })
11 |     t.ok(data.entries.length === 30, '@' +user +' activity feed contains 30 entries')
12 |     t.end();
13 |   })
14 | })
15 | 
16 | test.skip('Try to break it by supplying non-existent user', function(t){
17 |   var user = '' + Math.floor(Math.random() * 1000000000000000) + '.atom';
18 |   feed(user, function(err, data){
19 |     t.ok(err === 404, 'Got 404 Error when username does not exist');
20 |     // t.ok(data === null, 'No data for @' +user +' activity feed');
21 |     // t.ok(data.entries.length === 30, '@' +user +' activity feed contains 30 entries')
22 |     t.end();
23 |   })
24 | })
25 | 
--------------------------------------------------------------------------------
/test/fixtures/dwyl-tudo-issue-51-api-comments.json:
--------------------------------------------------------------------------------
1 | [
2 |   {
3 |     "url": "https://api.github.com/repos/dwyl/tudo/issues/comments/123807796",
4 |     "html_url": "https://github.com/dwyl/tudo/issues/51#issuecomment-123807796",
5 |     "issue_url": "https://api.github.com/repos/dwyl/tudo/issues/51",
6 |     "id": 123807796,
7 |     "user": {
8 |       "login": "iteles",
9 |       "id": 4185328,
10 |       "avatar_url": "https://avatars.githubusercontent.com/u/4185328?v=3",
11 | "gravatar_id": "", 12 | "url": "https://api.github.com/users/iteles", 13 | "html_url": "https://github.com/iteles", 14 | "followers_url": "https://api.github.com/users/iteles/followers", 15 | "following_url": "https://api.github.com/users/iteles/following{/other_user}", 16 | "gists_url": "https://api.github.com/users/iteles/gists{/gist_id}", 17 | "starred_url": "https://api.github.com/users/iteles/starred{/owner}{/repo}", 18 | "subscriptions_url": "https://api.github.com/users/iteles/subscriptions", 19 | "organizations_url": "https://api.github.com/users/iteles/orgs", 20 | "repos_url": "https://api.github.com/users/iteles/repos", 21 | "events_url": "https://api.github.com/users/iteles/events{/privacy}", 22 | "received_events_url": "https://api.github.com/users/iteles/received_events", 23 | "type": "User", 24 | "site_admin": false 25 | }, 26 | "created_at": "2015-07-22T17:54:12Z", 27 | "updated_at": "2015-07-23T12:21:24Z", 28 | "body": "I'd love to test this out, this will be an amazing selling point if we can get the performance to work like we expect!" 29 | }, 30 | { 31 | "url": "https://api.github.com/repos/dwyl/tudo/issues/comments/124048121", 32 | "html_url": "https://github.com/dwyl/tudo/issues/51#issuecomment-124048121", 33 | "issue_url": "https://api.github.com/repos/dwyl/tudo/issues/51", 34 | "id": 124048121, 35 | "user": { 36 | "login": "nelsonic", 37 | "id": 194400, 38 | "avatar_url": "https://avatars.githubusercontent.com/u/194400?v=3", 39 | "gravatar_id": "", 40 | "url": "https://api.github.com/users/nelsonic", 41 | "html_url": "https://github.com/nelsonic", 42 | "followers_url": "https://api.github.com/users/nelsonic/followers", 43 | "following_url": "https://api.github.com/users/nelsonic/following{/other_user}", 44 | "gists_url": "https://api.github.com/users/nelsonic/gists{/gist_id}", 45 | "starred_url": "https://api.github.com/users/nelsonic/starred{/owner}{/repo}", 46 | "subscriptions_url": "https://api.github.com/users/nelsonic/subscriptions", 47 | "organizations_url": "https://api.github.com/users/nelsonic/orgs", 48 | "repos_url": "https://api.github.com/users/nelsonic/repos", 49 | "events_url": "https://api.github.com/users/nelsonic/events{/privacy}", 50 | "received_events_url": "https://api.github.com/users/nelsonic/received_events", 51 | "type": "User", 52 | "site_admin": false 53 | }, 54 | "created_at": "2015-07-23T10:20:15Z", 55 | "updated_at": "2015-07-23T10:20:15Z", 56 | "body": "@iteles have you watched the **Foundation** Episode featuring ***Kevin Systrom*** (***instagram***) ?\r\n> https://www.youtube.com/watch?v=nld8B9l1aRE\r\n\r\nWhat were the [**USP**](https://en.wikipedia.org/wiki/Unique_selling_proposition)s that contributed to ***instagram***'s success (_considering how many photo-related-apps were in the app store at the time_) ?\r\n\r\ncc: @besarthoxhaj " 57 | }, 58 | { 59 | "url": "https://api.github.com/repos/dwyl/tudo/issues/comments/124075792", 60 | "html_url": "https://github.com/dwyl/tudo/issues/51#issuecomment-124075792", 61 | "issue_url": "https://api.github.com/repos/dwyl/tudo/issues/51", 62 | "id": 124075792, 63 | "user": { 64 | "login": "besarthoxhaj", 65 | "id": 7887496, 66 | "avatar_url": "https://avatars.githubusercontent.com/u/7887496?v=3", 67 | "gravatar_id": "", 68 | "url": "https://api.github.com/users/besarthoxhaj", 69 | "html_url": "https://github.com/besarthoxhaj", 70 | "followers_url": "https://api.github.com/users/besarthoxhaj/followers", 71 | "following_url": 
"https://api.github.com/users/besarthoxhaj/following{/other_user}", 72 | "gists_url": "https://api.github.com/users/besarthoxhaj/gists{/gist_id}", 73 | "starred_url": "https://api.github.com/users/besarthoxhaj/starred{/owner}{/repo}", 74 | "subscriptions_url": "https://api.github.com/users/besarthoxhaj/subscriptions", 75 | "organizations_url": "https://api.github.com/users/besarthoxhaj/orgs", 76 | "repos_url": "https://api.github.com/users/besarthoxhaj/repos", 77 | "events_url": "https://api.github.com/users/besarthoxhaj/events{/privacy}", 78 | "received_events_url": "https://api.github.com/users/besarthoxhaj/received_events", 79 | "type": "User", 80 | "site_admin": false 81 | }, 82 | "created_at": "2015-07-23T11:59:31Z", 83 | "updated_at": "2015-07-23T11:59:31Z", 84 | "body": "@nelsonic love the idea! Let's do it!" 85 | } 86 | ] 87 | -------------------------------------------------------------------------------- /test/fixtures/dwyl-tudo-issue-51-api.json: -------------------------------------------------------------------------------- 1 | { 2 | "url": "https://api.github.com/repos/dwyl/tudo/issues/51", 3 | "labels_url": "https://api.github.com/repos/dwyl/tudo/issues/51/labels{/name}", 4 | "comments_url": "https://api.github.com/repos/dwyl/tudo/issues/51/comments", 5 | "events_url": "https://api.github.com/repos/dwyl/tudo/issues/51/events", 6 | "html_url": "https://github.com/dwyl/tudo/issues/51", 7 | "id": 96442793, 8 | "number": 51, 9 | "title": "Pre-fetch people's issues while they are authenticating with GitHub", 10 | "user": { 11 | "login": "nelsonic", 12 | "id": 194400, 13 | "avatar_url": "https://avatars.githubusercontent.com/u/194400?v=3", 14 | "gravatar_id": "", 15 | "url": "https://api.github.com/users/nelsonic", 16 | "html_url": "https://github.com/nelsonic", 17 | "followers_url": "https://api.github.com/users/nelsonic/followers", 18 | "following_url": "https://api.github.com/users/nelsonic/following{/other_user}", 19 | "gists_url": "https://api.github.com/users/nelsonic/gists{/gist_id}", 20 | "starred_url": "https://api.github.com/users/nelsonic/starred{/owner}{/repo}", 21 | "subscriptions_url": "https://api.github.com/users/nelsonic/subscriptions", 22 | "organizations_url": "https://api.github.com/users/nelsonic/orgs", 23 | "repos_url": "https://api.github.com/users/nelsonic/repos", 24 | "events_url": "https://api.github.com/users/nelsonic/events{/privacy}", 25 | "received_events_url": "https://api.github.com/users/nelsonic/received_events", 26 | "type": "User", 27 | "site_admin": false 28 | }, 29 | "labels": [ 30 | { 31 | "url": "https://api.github.com/repos/dwyl/tudo/labels/enhancement", 32 | "name": "enhancement", 33 | "color": "84b6eb" 34 | }, 35 | { 36 | "url": "https://api.github.com/repos/dwyl/tudo/labels/help%20wanted", 37 | "name": "help wanted", 38 | "color": "159818" 39 | }, 40 | { 41 | "url": "https://api.github.com/repos/dwyl/tudo/labels/question", 42 | "name": "question", 43 | "color": "cc317c" 44 | } 45 | ], 46 | "state": "open", 47 | "locked": false, 48 | "assignee": { 49 | "login": "besarthoxhaj", 50 | "id": 7887496, 51 | "avatar_url": "https://avatars.githubusercontent.com/u/7887496?v=3", 52 | "gravatar_id": "", 53 | "url": "https://api.github.com/users/besarthoxhaj", 54 | "html_url": "https://github.com/besarthoxhaj", 55 | "followers_url": "https://api.github.com/users/besarthoxhaj/followers", 56 | "following_url": "https://api.github.com/users/besarthoxhaj/following{/other_user}", 57 | "gists_url": 
"https://api.github.com/users/besarthoxhaj/gists{/gist_id}", 58 | "starred_url": "https://api.github.com/users/besarthoxhaj/starred{/owner}{/repo}", 59 | "subscriptions_url": "https://api.github.com/users/besarthoxhaj/subscriptions", 60 | "organizations_url": "https://api.github.com/users/besarthoxhaj/orgs", 61 | "repos_url": "https://api.github.com/users/besarthoxhaj/repos", 62 | "events_url": "https://api.github.com/users/besarthoxhaj/events{/privacy}", 63 | "received_events_url": "https://api.github.com/users/besarthoxhaj/received_events", 64 | "type": "User", 65 | "site_admin": false 66 | }, 67 | "milestone": { 68 | "url": "https://api.github.com/repos/dwyl/tudo/milestones/3", 69 | "html_url": "https://github.com/dwyl/tudo/milestones/Minimal%20Usable%20Product", 70 | "labels_url": "https://api.github.com/repos/dwyl/tudo/milestones/3/labels", 71 | "id": 1234895, 72 | "number": 3, 73 | "title": "Minimal Usable Product", 74 | "description": "What is the absolute minimum we can do to deliver value to people using the app?\r\n(and thus make them want to come back and use it!)", 75 | "creator": { 76 | "login": "nelsonic", 77 | "id": 194400, 78 | "avatar_url": "https://avatars.githubusercontent.com/u/194400?v=3", 79 | "gravatar_id": "", 80 | "url": "https://api.github.com/users/nelsonic", 81 | "html_url": "https://github.com/nelsonic", 82 | "followers_url": "https://api.github.com/users/nelsonic/followers", 83 | "following_url": "https://api.github.com/users/nelsonic/following{/other_user}", 84 | "gists_url": "https://api.github.com/users/nelsonic/gists{/gist_id}", 85 | "starred_url": "https://api.github.com/users/nelsonic/starred{/owner}{/repo}", 86 | "subscriptions_url": "https://api.github.com/users/nelsonic/subscriptions", 87 | "organizations_url": "https://api.github.com/users/nelsonic/orgs", 88 | "repos_url": "https://api.github.com/users/nelsonic/repos", 89 | "events_url": "https://api.github.com/users/nelsonic/events{/privacy}", 90 | "received_events_url": "https://api.github.com/users/nelsonic/received_events", 91 | "type": "User", 92 | "site_admin": false 93 | }, 94 | "open_issues": 5, 95 | "closed_issues": 0, 96 | "state": "open", 97 | "created_at": "2015-07-31T09:39:40Z", 98 | "updated_at": "2015-08-02T07:38:33Z", 99 | "due_on": "2016-07-05T23:00:00Z", 100 | "closed_at": null 101 | }, 102 | "comments": 3, 103 | "created_at": "2015-07-22T00:00:45Z", 104 | "updated_at": "2015-08-02T08:47:00Z", 105 | "closed_at": null, 106 | "body": "instead of waiting for people to perform the steps to authorise Tudo (to access their GitHub orgs/issues we *could* request their GitHub username on the login page and initiate the retrieval of their issues *while* they are authenticating... That way, by the time they get back to Tudo their issues dashboard is already pre-rendered and loaded! 
This is a ***wow-factor*** people won't be *expecting* and thus our app immediately delivers on our first promise!\r\n\r\nThoughts?", 107 | "closed_by": null 108 | } 109 | -------------------------------------------------------------------------------- /test/fixtures/dwyl-tudo-issue-51-scrape.json: -------------------------------------------------------------------------------- 1 | { entries: 2 | [ { id: 'issue-96442793', 3 | author: 'nelsonic', 4 | created: '2015-07-22T00:00:45Z', 5 | body: 'instead of waiting for people to perform the steps to authorise Tudo (to access their GitHub orgs/issues we could request their GitHub username on the login page and initiate the retrieval of their issues while they are authenticating... That way, by the time they get back to Tudo their issues dashboard is already pre-rendered and loaded! This is a wow-factor people won\'t be expecting and thus our app immediately delivers on our first promise!\n\nThoughts?' }, 6 | { id: 'issuecomment-123807796', 7 | author: 'iteles', 8 | created: '2015-07-22T17:54:12Z', 9 | body: 'I\'d love to test this out, this will be an amazing selling point if we can get the performance to work like we expect!' }, 10 | { id: 'issuecomment-124048121', 11 | author: 'nelsonic', 12 | created: '2015-07-23T10:20:15Z', 13 | body: '@iteles have you watched the Foundation Episode featuring Kevin Systrom (instagram) ?\n\n\nhttps://www.youtube.com/watch?v=nld8B9l1aRE\n\n\nWhat were the USPs that contributed to instagram\'s success (considering how many photo-related-apps were in the app store at the time) ?\n\ncc: @besarthoxhaj' }, 14 | { id: 'issuecomment-124075792', 15 | author: 'besarthoxhaj', 16 | created: '2015-07-23T11:59:31Z', 17 | body: '@nelsonic love the idea! Let\'s do it!' } ], 18 | labels: [ 'enhancement', 'help wanted', 'question' ], 19 | participants: [ 'nelsonic', 'iteles', 'besarthoxhaj' ], 20 | url: '/dwyl/tudo/issues/51', 21 | title: 'Pre-fetch people\'s issues while they are authenticating with GitHub', 22 | state: 'Open', 23 | author: 'nelsonic', 24 | created: '2015-07-22T00:00:45Z', 25 | milestone: 'Minimal Usable Product', 26 | assignee: 'besarthoxhaj' } -------------------------------------------------------------------------------- /test/fixtures/dwyl-tudo-issue-51.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Pre-fetch people's issues while they are authenticating with GitHub · Issue #51 · dwyl/tudo · GitHub 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | Skip to content 74 |
[ roughly 770 lines of the saved GitHub issue page omitted: the fixture's HTML markup was lost in extraction, leaving only page chrome ("Skip to content", clone-URL widgets, "Download ZIP", sign-up footer) around text already captured by the companion fixtures above: issue #51 "Pre-fetch people's issues while they are authenticating with GitHub", state Open, labels enhancement / help wanted / question, and the three comments by @nelsonic, @iteles and @besarthoxhaj. ]
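The HTML fixture above pairs with dwyl-tudo-issue-51-scrape.json (the expected parser output for the same page) and the two -api*.json fixtures (the equivalent GitHub API responses). Below is a sketch of how such a fixture could drive an offline test, modelled on the manual cheerio invocation at the bottom of test/issues.test.js; whether lib/issue.js accepts the same ($, url, callback) shape is an assumption to verify, so the test is left skipped.

var fs = require('fs');
var path = require('path');
var cheerio = require('cheerio');
var test = require('tape');
var issue = require('../lib/issue.js'); // assumed to share the ($, url, cb) shape of lib/issues.js

test.skip('parse dwyl/tudo#51 from the saved fixture (offline)', function (t) {
  // read the saved page from disk so no network request is needed:
  var html = fs.readFileSync(
    path.join(__dirname, 'fixtures', 'dwyl-tudo-issue-51.html'), 'utf8');
  var $ = cheerio.load(html);
  issue($, '/dwyl/tudo/issues/51', function (err, data) {
    t.equal(data.title,
      "Pre-fetch people's issues while they are authenticating with GitHub",
      'title matches the -scrape.json fixture');
    t.end();
  });
});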
--------------------------------------------------------------------------------
/test/followers.test.js:
--------------------------------------------------------------------------------
1 | var test = require('tape');
2 | var followers = require('../lib/switcher');
3 | 
4 | test('read list of followers for @hangouts (single page of followers) ', function (t) {
5 |   const username = "hangouts";
6 |   const path = username + '?tab=followers';
7 |   followers(path, function(err, data) {
8 |     t.equal(data.type, 'followers', username + ' data.type: ' + data.type);
9 |     // console.log(data.entries.length)
10 |     // console.log(data)
11 |     t.ok(data.entries.length < 20, '@' + username + ' has ' + data.entries.length + ' followers');
12 |     const people = data.entries.map(e => e.username);
13 |     t.ok(people.indexOf('giko') >-1, 'giko follows @' + username)
14 |     t.ok(typeof data.next_page === 'string', '@' + username +' only has 1 page of followers');
15 |     t.end();
16 |   });
17 | })
18 | 
19 | test('read list of followers for @iteles (multi-page)', function(t){
20 |   const username = "iteles";
21 |   const path = username + '?tab=followers';
22 |   followers(path, function(err, data) {
23 |     // console.log(data.entries.length)
24 |     // console.log(data)
25 |     t.ok(data.entries.length === 50, '"followers": ' + data.entries.length + ' on page 1');
26 |     // console.log(' - - - - - - - - - - - - - data.next_page:');
27 |     // console.log(data.next_page);
28 |     t.ok(data.next_page.indexOf('page=2&tab=followers') > -1,
29 |       username +' multi-page followers');
30 |     // crawl second page of followers to confirm next_page is working:
31 |     followers(data.next_page, function (err2, data2) {
32 | 
33 |       // console.log(err2, data2);
34 |       t.ok(data2.entries.length === 50, '"followers": ' + data2.entries.length);
35 |       t.ok(data2.next_page.indexOf('page=3&tab=followers') > -1,
36 |         username +' multi-page followers');
37 |       t.end();
38 |     });
39 |   });
40 | });
41 | 
42 | // see: github.com/nelsonic/github-scraper/issues/60
43 | test.skip('Regression Test for issue #60', function(t){
44 |   var username = 'hangouts?tab=followers';
45 |   followers(username, function(err, data) {
46 |     // console.log(username + ' has followers: ' + data.entries);
47 |     t.ok(data.entries.length > 1, '"followers": '+data.entries.length);
48 |     t.end();
49 |   });
50 | })
51 | 
--------------------------------------------------------------------------------
/test/following.test.js:
--------------------------------------------------------------------------------
1 | var test = require('tape');
2 | var following = require('../lib/switcher');
3 | 
4 | test('read list of following for @torvalds (zero people!)', function(t){
5 |   var url = 'torvalds?tab=following';
6 |   following(url, function (err, data) {
7 |     t.equal(data.type, 'following', 'data.type is: ' + data.type);
8 |     t.ok(data.entries.length === 0, '"following": '+data.entries.length);
9 |     t.ok(typeof data.next_page === 'string', url
10 |       +' has no "next_page" because he is not following anyone!');
11 |     t.end();
12 |   });
13 | })
14 | 
15 | test('read list of following for @Marak (multi-page)', function(t){
16 |   var url = 'Marak?tab=following';
17 |   following(url, function (err, data) {
18 |     t.equal(data.type, 'following', 'data.type is: ' + data.type);
19 |     t.ok(data.entries.length === 50, '"following": '+data.entries.length);
20 |     t.ok(data.next_page.indexOf('page=2') > -1,
21 |       url +' multi-page followers');
22 |     // crawl second page:
23 |     following(data.next_page, function (err2, data2) {
24 |       t.ok(data2.entries.length > 20, '"following": '+data2.entries.length);
25 |       t.end();
26 |     })
27 |   });
28 | })
29 | 
--------------------------------------------------------------------------------
/test/http_request.test.js:
--------------------------------------------------------------------------------
1 | var http_request = require('../lib/http_request');
2 | var test = require('tape');
3 | var dir = __dirname.split('/')[__dirname.split('/').length-1];
4 | var file = dir + __filename.replace(__dirname, '') + " > ";
5 | 
6 | test('make GET request to invalid url (error branch check) EXPECT RED:', function (t) {
7 |   var path = '/' + Math.floor(Math.random() * 1000000000000000);
8 |   http_request(path, function (e, res) {
9 |     t.equal(e, 404);
10 |     t.end();
11 |   });
12 | });
13 | 
14 | test('make GET request to valid url (success branch check)', function (t) {
15 |   var path = '/nelsonic' ;
16 |   http_request(path, function (statusCode, html) {
17 |     // console.log(statusCode, html);
18 |     t.equal(statusCode, 200, 'statusCode for valid request is: ' + statusCode);
19 |     t.ok(html.indexOf('<!DOCTYPE html>') > -1, 'got html back from GitHub');
20 |     t.end();
21 |   });
22 | });
23 | 
24 | // see: https://github.com/nelsonic/github-scraper/issues/60
25 | var validate = require('../lib/url_validator');
26 | 
27 | test('Regression Test for issue #60', function(t) {
28 |   var path = '/hangouts?tab=followers';
29 |   http_request(path, function (statusCode, html) {
30 |     t.equal(statusCode, 200, 'statusCode for valid request is: ' + statusCode);
31 |     t.ok(html.indexOf('<!DOCTYPE html>') > -1, 'got html back from GitHub');
32 |     t.end();
33 |   });
34 | });
35 | 
--------------------------------------------------------------------------------
/test/issue.test.js:
--------------------------------------------------------------------------------
1 | var test = require('tape');
2 | var issue = require('../lib/switcher');
3 | 
4 | test('Scrape /dwyl/tudo/issues/51 for comments & meta-data', function (t) {
5 |   var url = '/dwyl/tudo/issues/51';
6 |   issue(url, function(err, data) {
7 |     console.log(data)
8 |     t.ok(data.url.indexOf(url) > -1, url + ' is: ' +data.url)
9 |     t.ok(data.title.length > 0, url + ' has title: '+data.title);
10 |     t.ok(data.state.length > 0, url + ' is: '+data.state);
11 |     t.ok(data.author.length > 0, url + ' was authored by: '+data.author);
12 |     t.ok(data.created.length > 0, url + ' was created on: '+data.created);
13 |     // labels
14 |     t.ok(data.labels.length > 2, url + ' has '+data.labels.length + ' labels')
15 |     t.ok(data.milestone === 'Minimal Usable Product', 'Milestone is: '
16 |       + data.milestone);
17 |     t.ok(data.assignee.length > 0, url + ' has assignee: '+ data.assignee);
18 |     t.ok(data.participants.length > 2, url + ' has participants: '
19 |       + data.participants);
20 |     t.ok(data.participants.indexOf('iteles') > -1, url
21 |       + ' has participation from @iteles');
22 | 
23 |     // t.ok(data.entries.length > 2,
24 |     //   url + ' has: '+data.entries.length + ' comments');
25 | 
26 |     t.end();
27 |   });
28 | })
29 | 
30 | test.skip('Scrape known issue without assignee', function(t) {
31 |   var url ='/1602/compound/issues/20'
32 |   issue(url, function(err, data){
33 |     t.ok(typeof data.assignee === 'undefined', "assignee is undefined")
34 |     t.ok(data.state === 'Closed', url +' state is: ' + data.state)
35 |     t.end()
36 |   });
37 | })
38 | 
39 | test.skip('Scrape known issue without milestone', function(t){
40 |   var url = '/dwyl/time/issues/154';
41 |   issue(url, function(err, data){
42 |     console.log(data);
43 |     var d =
data.entries.filter(function(item){ 44 | return item.id === 'issuecomment-104228711'; 45 | }) 46 | d = d[0] // there should only be one entry 47 | t.ok(data.state === 'Closed', url +' state is: ' + data.state) 48 | var dash = ' - - - - - - - - - - - - ' 49 | var easter_egg = '\n' + dash +'> '+ d.body +' <' + dash +'\n' 50 | t.ok(d.body === 'I Love you!', url +' last comment is: '+easter_egg); 51 | t.end() 52 | }); 53 | }) 54 | -------------------------------------------------------------------------------- /test/issues.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var issues = require('../lib/switcher'); 3 | 4 | test.skip('crawl known repository that has *many* issues ', function(t){ 5 | var project = '/dwyl/time/issues' 6 | issues(project, function(err, list) { 7 | t.ok(err === null, 'No Error when crawling ' +project +' issues'); 8 | console.log(list.entries.length); 9 | // t.ok(err === 404, 'Got 404 Error when username does not exist'); 10 | var count = list.entries.length; 11 | // first page should have 25 issues! 12 | t.ok(count === 25, 'repo: ' +project +' has ' +count + ' issues (non-zero) on (First Page)'); 13 | t.ok(list.open > 1, 'repo: ' +project +' has ' +list.open + ' OPEN issues (non-zero)'); 14 | t.ok(list.closed > 10, 'repo: ' +project +' has ' +list.closed + ' CLOSED issues'); 15 | // crawl the next page of issues: 16 | issues(list.next_page, function(err2, list2){ 17 | t.ok(list2.open > 10, 'repo: ' +project +' has ' +list.open + ' OPEN issues (non-zero)'); 18 | t.ok(list2.closed > 5, 'repo: ' +project +' has ' +list2.closed + ' CLOSED issues'); 19 | t.end(); 20 | }) 21 | }) 22 | }) 23 | 24 | test.skip('crawl known repository that only has a single page of issues ', function(t){ 25 | var project = '/dwyl/ignored/issues' 26 | issues(project, function(err, list) { 27 | console.log(list) 28 | t.ok(list.url.indexOf(project) > -1, '✓ url is set: '+list.url) 29 | t.ok(err === null, 'No Error when crawling ' +project +' issues'); 30 | var count = list.entries.length; 31 | t.ok(count === 0, 'repo: ' +project +' has ' +count + ' issues (ZERO)'); 32 | t.ok(list.closed > 5, 'repo: ' +project +' has ' +list.closed + ' CLOSED issues'); 33 | t.end(); 34 | }) 35 | }) 36 | 37 | // see: https://github.com/nelsonic/github-scraper/issues/53 38 | test.skip('crawl known repository (FORK) WITHOUT issues ', function(t){ 39 | var project = 'ladieswhocode/london-tech-event-hack-collection/issues' 40 | issues(project, function(err, list) { 41 | t.ok(err === 404, '✓ '+project +" has no issues >> HTTP Status: "+err) 42 | t.end(); 43 | }) 44 | }) 45 | 46 | 47 | var cheerio = require('cheerio') 48 | var issues2 = require('../lib/issues.js'); 49 | // see: https://github.com/nelsonic/arana/issues/16 50 | test.skip('Problem Child (Fork) Repo (MANUAL INVOCATION)', function(t){ 51 | var url = 'https://github.com/foundersandcoders/resolve-path' 52 | wreck.get(url, function (error, response, html) { 53 | var $ = cheerio.load(html); 54 | issues2($, url, function(err, data){ 55 | console.log(err, data) 56 | t.ok(err === 404, '✓ '+url +" Got "+err + " (as expected!)") 57 | t.end(); 58 | }) 59 | }); 60 | }) 61 | -------------------------------------------------------------------------------- /test/issues_search.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var issues_search = require('../lib/issues_search'); 3 | 4 | test.skip('expect 400 repo is not 
stated', function(t) { 5 | issues_search(function(err) { 6 | t.ok(err === 400, 'got 400 error when no options defined'); 7 | t.end(); 8 | }) 9 | }) 10 | 11 | test.skip('expect random (non-existent) repo to return zero results ', function(t){ 12 | var options = { username : Math.floor(Math.random() * 1000000000000000) } // a nice long "random" number 13 | issues_search(options, function(err, list){ 14 | // console.log(err, stats) 15 | t.ok(err === null, 'Search still returns a 200 with no results'); 16 | t.ok(list.entries.length === 0, 'no issues (as expected)'); 17 | t.end(); 18 | }) 19 | }) 20 | 21 | test.skip('scrape second page of results', function(t){ 22 | var options = { 23 | next: '/search?o=desc&p=2&q=author%3Aiteles&s=created&state=open&type=Issues' 24 | } 25 | issues_search(options, function(err, list){ 26 | // console.log(err, list) 27 | t.ok(err === null, 'Search still returns a 200 with no results'); 28 | t.ok(list.entries.length > 0, 'non-zero number of issues'); 29 | t.ok(list.entries[0].author === 'iteles', 'issue successfully scraped'); 30 | t.end(); 31 | }) 32 | }) 33 | // test for next page 34 | // 35 | -------------------------------------------------------------------------------- /test/labels.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var labels = require('../lib/switcher'); 3 | 4 | test.skip('crawl dwyl/tudo/labels', function(t){ 5 | var project = 'dwyl/tudo/labels'; 6 | labels(project, function(err, list) { 7 | console.log(list); 8 | t.ok(err === null, 'No Error when crawling ' + project +' (repo pages)'); 9 | var question = list.entries.filter(function(item){ 10 | return item.name === 'question'; 11 | }) 12 | question = question[0]; 13 | t.ok(question.link === '/dwyl/tudo/labels/question', 'question.link is : '+question.link); 14 | t.ok(question.count > 1, 'question.count (number of open issues): '+question.count); 15 | t.ok(question.style.indexOf('#fff') > -1, 'question.styles are '+question.style); 16 | t.end(); 17 | }) 18 | }) 19 | -------------------------------------------------------------------------------- /test/milestones.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var milestones = require('../lib/switcher'); 3 | 4 | test.skip('crawl /dwyl/tudo/milestones', function(t){ 5 | var project = '/dwyl/tudo/milestones'; 6 | milestones(project, function(err, data) { 7 | console.log(data); 8 | t.ok(err === null, 'No Error when crawling ' + project +' (repo pages)'); 9 | t.ok(data.open > 0, 'data.open '+data.open); 10 | t.ok(data.closed > 0, 'data.closed '+data.closed); 11 | t.end(); 12 | }) 13 | }) 14 | 15 | test.skip('/rethinkdb/rethinkdb has many milestones', function(t){ 16 | var project = '/rethinkdb/rethinkdb/milestones'; 17 | milestones(project, function(err, data) { 18 | // console.log(list); 19 | t.ok(err === null, 'No Error when crawling ' + project +' (repo pages)'); 20 | t.ok(data.open > 2, 'data.open '+data.open); 21 | t.ok(data.entries.length === data.open, project + " has " + data.entries.length +' open milestones' ) 22 | t.ok(data.closed > 0, project + ' closed milestones: '+data.closed); 23 | t.end(); 24 | }) 25 | }) 26 | -------------------------------------------------------------------------------- /test/org.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var org = require('../lib/switcher'); 3 | var dir = 
__dirname.split('/')[__dirname.split('/').length-1]; 4 | var file = dir + __filename.replace(__dirname, '') + " > "; 5 | 6 | test(file + 'Scrape an org WITHOUT a next page (known data)', function(t){ 7 | var url = '/peersun'; 8 | org(url, function(err, data) { 9 | t.equal(data.type, 'org', url + ' data.type: ' + data.type); 10 | t.ok(data.entries.length > 5, 'org ' 11 | + url + ' has ' + data.entries.length + ' repos.') 12 | // t.ok(data.pcount === 0, '"pcount":' + data.pcount); 13 | 14 | console.log(data) 15 | const last = data.entries[data.entries.length-1]; 16 | t.equal(last.updated, '2014-02-18T23:09:24Z', 17 | 'last.updated: ' + last.updated); 18 | // console.log(' - - - - - - - - - - - - - data.entries:'); 19 | // console.log(data.entries); 20 | // console.log(' - - - - - - - - - - - - -'); 21 | t.end(); 22 | }); 23 | }) 24 | 25 | test(file + 'Scrape an org WITH a next page', function(t){ 26 | var url = '/github'; 27 | org(url, function(err, data) { 28 | // delete(data.entries) 29 | console.log(err, data); 30 | // t.ok(data.pcount > 100, '"pcount":'+data.pcount); 31 | t.ok(data.location === 'San Francisco, CA', 'data.location: ' + data.location); 32 | t.ok(data.website === 'https://github.com/about', 'data.url: '+data.url); 33 | // t.ok(data.email === 'support@github.com', 'data.email: '+data.email); 34 | t.equal(data.uid, 9919, url + ' uid is ' + data.uid); 35 | t.end(); 36 | }); 37 | }) 38 | 39 | test(file + 'Fetch Second page of dwyl org', function (t) { 40 | let url = 'dwyl'; 41 | org(url, function(err, data) { 42 | // console.log(data.entries); 43 | t.ok(data.entries.length === 10, 'FIRST page of org has ' + data.entries.length + ' repos') 44 | // t.ok(data.pcount > 10, '"pcount":'+data.pcount); 45 | // t.ok(data.next_page === '/dwyl?page=2', 46 | // 'data.next_page is: ' + data.next_page); 47 | url = '/orgs/dwyl/repositories?type=all'; 48 | org(url, function(err, data) { 49 | console.log(data); 50 | t.ok(data.entries.length === 30, 'SECOND page of org has ' + data.entries.length + ' repos') 51 | // t.ok(data.pcount > 10, '"pcount":'+data.pcount); 52 | t.ok(data.next_page === '/orgs/dwyl/repositories?type=all&page=2', 'dwyl has more than one page'); 53 | t.end(); 54 | }); 55 | }); 56 | }) 57 | 58 | test(file + 'ORG with no people', function(t){ 59 | var url = '/pandajs'; 60 | org(url, function(err, data) { 61 | console.log('data', data); 62 | t.equal(data.description, 63 | "people who are super into pandas and javascript!", 64 | 'data.description: ' + data.description) 65 | t.ok(typeof data.website === 'undefined', "No website") 66 | t.ok(typeof data.location === 'undefined', "No location") 67 | t.ok(typeof data.email === 'undefined', "No email") 68 | // t.ok(data.pcount === 0, url + ' "pcount":'+data.pcount); 69 | t.end(); 70 | }); 71 | }) 72 | -------------------------------------------------------------------------------- /test/people.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var people = require('../lib/switcher'); 3 | 4 | test('Scrape org with single page of people', function(t){ 5 | var org = 'orgs/tableflip/people' 6 | people(org, function(err, data){ 7 | t.ok(data.entries.length > 5, 'There are '+data.entries.length +' people in '+org); 8 | const people = data.entries.map(e => e.username); 9 | t.ok(people.indexOf('alanshaw') >-1, 'Alan is a member of '+org) 10 | t.end(); 11 | }) 12 | }) 13 | 14 | test('Scrape org with multiple pages of people', function(t){ 15 | var org = 
'orgs/github/people' 16 | people(org, function(err, data){ 17 | console.log(data.next_page); 18 | t.ok(data.entries.length > 20, 'There are '+data.entries.length +' people in '+org); 19 | t.ok(data.next_page === '/orgs/github/people?page=2', org +' has multiple pages of peeps!'); 20 | people(data.next_page, function(err2, data2){ 21 | t.ok(data2.next_page === '/orgs/github/people?page=3', org +' has multiple pages of peeps!'); 22 | t.end(); 23 | }) 24 | }) 25 | }) 26 | -------------------------------------------------------------------------------- /test/profile.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var profile = require('../lib/switcher'); 3 | var dir = __dirname.split('/')[__dirname.split('/').length-1]; 4 | var file = dir + __filename.replace(__dirname, '') + " > "; 5 | 6 | test(file + 'Scrape @nelsonic GitHub profile (consistent state profile)', function(t){ 7 | var user = 'nelsonic'; 8 | profile(user, function (err, data) { 9 | // console.log('data', data) 10 | // t.equal(data.type, 'profile', user + ' data.type: ' + data.type); 11 | t.ok(data.avatar.match(/githubusercontent.com\/u\/194400/) !== null, 12 | 'Image is what we expect: ' + data.avatar); 13 | t.ok(data.uid === 194400, '@' + user + ' has GitHub user_id: ' + data.uid); 14 | t.ok(data.username === 'nelsonic', '@' + user + ' username: ' + data.username); 15 | 16 | // t.ok(data.current > 400, 'Current Streak ' + data.current +' is over 500 days!'); 17 | t.ok(data.name === 'Nelson', 18 | '- @' + user + ' Name:' + data.name); 19 | t.ok(data.worksfor === '@dwyl', user + ' Works for ' + data.worksfor); 20 | // t.ok(data.email === 'contact.nelsonic+github@gmail.com', 21 | // '- @' + user + ' Email address is: contact.nelsonic@gmail.com'); 22 | // console.log('data.website:', data.website); 23 | t.ok(data.website === 'https://dwyl.com', 24 | user + ' Website URL is ' + data.website); 25 | // console.log(data.location); 26 | t.ok(data.location === 'Braga, Portugal', '- @' + user + ' Based in Braga, PT'); 27 | t.ok(data.followers > 400, '- @' + user + ' Has more than 400 followers'); 28 | t.ok(data.stars > 100, '- @' + user + ' Has starred '+ data.starred); 29 | 30 | t.ok(data.following > 300, '- @' + user 31 | + ' Is following more than 300 people'); 32 | // t.ok(data.contribs > 3000, '- @' + user + ' Has made ' + data.contribs 33 | // + ' contributions to Open Source this year!'); 34 | 35 | t.ok(data.pinned.length === 6, '- @' + user 36 | + ' Has Six "Pinned" Repositories'); 37 | 38 | t.ok(Object.keys(data.orgs).length > 2, '- @' + user + ' Is a member of ' 39 | + Object.keys(data.orgs).length + ' Orgs'); 40 | 41 | t.ok(data.developerprogram === true, '- @' 42 | + user + ' is a member of the "GitHub Developer Program"'); 43 | // regression: https://github.com/nelsonic/github-scraper/issues/79 44 | t.ok(data.stars > 2000, '- @' + user + ' Has starred ' + data.stars); 45 | 46 | t.end(); 47 | }); 48 | }); 49 | 50 | test(file + 'Check @torvalds IS NOT GitHub Dev Program Member', function(t){ 51 | var url = 'torvalds'; 52 | profile(url, function(err, data) { 53 | t.ok(typeof data.developerprogram === 'undefined', '- @' + url 54 | + ' is NOT a member of the "GitHub Developer Program"'); 55 | t.end(); 56 | }); 57 | }); 58 | 59 | test(file + 'Scrape @iteles detailed contribution matrix', function(t){ 60 | var user = 'iteles'; 61 | profile(user, function(err, data) { 62 | t.ok(data.bio.match(/Co-founder/i), '- @' + user + ' bio: ' + data.bio); 63 | // now 
client rendered so cannot parse! #132 64 | // t.ok(data.contribs > 100, '- @' + user + ' Has made ' + data.contribs 65 | // + ' contributions to Open Source this year!'); 66 | t.end(); 67 | }); 68 | }); 69 | 70 | test(file + '@dwylbot does not have a location!', function(t){ 71 | var url = 'dwylbot'; 72 | profile(url, function(err, data) { 73 | t.ok(typeof data.location === 'undefined', '- @' + url + ' is virtual!'); 74 | t.end(); 75 | }); 76 | }); 77 | -------------------------------------------------------------------------------- /test/repo.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var repo = require('../lib/switcher'); 3 | 4 | test('crawl known repository for stats', function(t) { 5 | var project = 'dwyl/adoro'; 6 | repo(project, function(err, stats) { 7 | // console.log(stats); 8 | t.equal(stats.type, 'repo', project + ' data.type: ' + stats.type); 9 | t.ok(err === null, 'No Error when crawling ' + project +' (repo pages)'); 10 | t.ok(stats.watchers > 3, ' has more than 1 watchers: '+stats.watchers); 11 | t.ok(stats.stars > 10, ' has more than 5 stars: '+stats.stars); 12 | t.ok(stats.forks > 0, ' has more than 0 forks: '+stats.forks); 13 | // t.ok(stats.branches > 0, ' has non-zero number of branches: ' + stats.branches); 14 | t.ok(stats.langs[0].name.indexOf('HTML') > -1, 'Language is: '+ stats.langs[0].name); 15 | t.end(); 16 | }) 17 | }) 18 | 19 | test('crawl single language repo', function (t) { 20 | var project = 'nelsonic/coin-change-ruby'; 21 | repo(project, function(err, stats) { 22 | const hasRuby=stats.langs.filter(e=>e.name==="Ruby") 23 | t.ok(hasRuby.length, 'Language is: '+ hasRuby[0].name) 24 | t.end(); 25 | }) 26 | }) 27 | 28 | test('crawl ZERO language repo', function(t){ 29 | var project = '/PeerSun/nodestack'; 30 | repo(project, function(err, stats) { 31 | t.ok(stats.langs.length === 0, 'Language is: '+ stats.langs +" (none)") 32 | t.end(); 33 | }) 34 | }) 35 | 36 | test('crawl forked repo', function(t){ 37 | var project = '/backhand/github-scraper'; 38 | repo(project, function(err, stats) { 39 | 40 | t.ok(stats.forkedfrom === 'nelsonic/github-scraper', 41 | 'Repo forked from /nelsonic/github-scraper') 42 | t.end(); 43 | }) 44 | }) 45 | 46 | test('crawl /dwyl/start-here (known repo)', function(t){ 47 | var project = '/dwyl/start-here'; 48 | repo(project, function(err, stats) { 49 | t.ok(stats.description.indexOf('Quick-start Guide') > -1, 50 | project + ' description: ' + stats.description); 51 | t.end(); 52 | }) 53 | }) 54 | 55 | test('dwyl/todo-list-javascript-tutorial known website', function (t) { 56 | var project = 'dwyl/javascript-todo-list-tutorial'; 57 | repo(project, function(err, stats) { 58 | // console.log('stats:', stats) 59 | t.ok(stats.website === 'dwyl.github.io/javascript-todo-list-tutorial', 60 | project + ' website: ' + stats.website); 61 | t.ok(stats.tags.indexOf('javascript') > -1, 62 | project + ' tags: ' + stats.tags); 63 | t.end(); 64 | }) 65 | }) 66 | 67 | test('crawl repo with lots of stars', function(t) { 68 | var project = 'angular/angular'; 69 | repo(project, function(err, stats) { 70 | t.ok(stats.watchers > 1000, ' has more than 1000 watchers: '+stats.watchers); 71 | t.ok(stats.stars > 1000, ' has more than 1000 stars: '+stats.stars); 72 | t.ok(stats.forks > 1000, ' has more than 1000 forks: '+stats.forks); 73 | // t.ok(stats.commits > 1000, ' has more than 1000 commits: '+stats.commits); 74 | t.end(); 75 | }); 76 | }); 77 | 78 | test('crawl repo with 
"Used by" metric issue #106', function(t) { 79 | const project = 'dwyl/decache'; 80 | repo(project, function(err, stats) { 81 | // console.log('stats', stats); 82 | // t.ok(stats.usedby > 25000, ' used by more than 25k: '+stats.usedby); 83 | t.ok(stats.stars > 100, ' has more than 1000 stars: '+stats.stars); 84 | t.ok(stats.forks > 10, ' has more than 1000 forks: '+stats.forks); 85 | // t.ok(stats.commits > 50, ' has more than 1000 commits: '+stats.commits); 86 | t.end(); 87 | }); 88 | }); 89 | -------------------------------------------------------------------------------- /test/repos.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var repositories = require('../lib/switcher'); 3 | 4 | test('crawl @iteles\' list of repositories (expect *many*!)', function(t){ 5 | var url = '/iteles?tab=repositories', repo; 6 | repositories(url, function(err, repos){ 7 | t.ok(err === null, 'No Error when crawling ' +url +' repos tab'); 8 | // console.log(repos) 9 | // console.log(' - - - - - - - - - - -') 10 | repo = repos.entries.filter(function(r) { 11 | return r.url === '/iteles/iteles.github.io'; 12 | }) 13 | repo = repo[0]; 14 | t.ok(repo.name === 'iteles.github.io', ' repos contains iteles.github.io'); 15 | t.ok(repo.stars > 0, ' repo iteles.github.io has non-zero number of stars: '+repo.stars); 16 | t.ok(repo.lang === 'HTML', ' repo ' + repo.url + ' is written in: '+repo.lang); 17 | t.end(); 18 | }) 19 | }) 20 | -------------------------------------------------------------------------------- /test/starred.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var starred = require('../lib/switcher'); 3 | 4 | test.skip('read list of starred repos for single page @lukebond (who never stars anything!) 
', function(t){ 5 | var username = 'stars/lukebond'; 6 | starred(username, function(err, data) { 7 | // console.log(data); 8 | // t.ok(data.repos.length === 20, 'first page of org has 20 repos: '+data.repos.length) 9 | t.ok(data.entries.length < 10, '@'+username +' has only "starred": '+data.entries.length +' repos'); 10 | t.ok(typeof data.next_page === 'undefined', username +' has no "next page" (because he does not star anything!)'); 11 | t.end(); 12 | }); 13 | }) 14 | 15 | test.skip('read list of starred repos for single page @iteles (multi-page) ', function(t){ 16 | var username = 'stars/iteles'; 17 | starred(username, function(err, data) { 18 | // console.log(data) 19 | // t.ok(data.repos.length === 20, 'first page of org has 20 repos: '+data.repos.length) 20 | t.ok(data.entries.length === 30, '@'+username +' has only "starred": '+data.entries.length +' repos (first page)'); 21 | t.ok(data.next_page.indexOf('page=2') > -1, '@'+username +' has multiple pages of starred repos'); 22 | starred(data.next_page, function(err2, data2){ 23 | console.log(data2.next_page) 24 | t.ok(data2.next_page.indexOf('page=3') > -1, '@'+username +' has multiple pages of starred repos'); 25 | t.end(); 26 | }) 27 | }); 28 | }) 29 | -------------------------------------------------------------------------------- /test/stars.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var stars = require('../lib/switcher'); 3 | 4 | test('read list of stars for pandajs/sad ', function(t){ 5 | var url = 'pandajs/sad/stargazers'; 6 | stars(url, function (err, data) { 7 | t.equal(data.type, 'stars', url + ' data.type: ' + data.type); 8 | t.ok(data.entries.length > 0, '"stars": '+data.entries.length); 9 | const people = data.entries.map(e => e.username); 10 | t.ok(people.indexOf('nelsonic') >-1, 'Nelson starred '+ url) 11 | t.ok(data.next_page === '', url +' only has 1 page of stars'); 12 | t.end(); 13 | }); 14 | }) 15 | 16 | test('read list of stars for dwyl/learn-tdd (multi-page)', function(t){ 17 | var url = 'dwyl/learn-tdd/stargazers'; 18 | stars(url, function (err, data) { 19 | // console.log(data) 20 | t.equal(data.entries.length, 48, '"stars": '+data.entries.length); 21 | t.ok(data.next_page.match(/page=2/), url +' multi-page stargazers'); 22 | // crawl second page: 23 | stars(data.next_page, function(err2, data2) { 24 | t.equal(data2.entries.length, 48, '"stars": ' + data.entries.length); 25 | t.ok(data2.next_page.match(/page=3/), url +' multi-page stargazers'); 26 | t.end(); 27 | }) 28 | }); 29 | }) 30 | -------------------------------------------------------------------------------- /test/switcher.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var switcher = require('../lib/switcher'); 3 | var dir = __dirname.split('/')[__dirname.split('/').length-1]; 4 | var file = dir + __filename.replace(__dirname, '') + " > "; 5 | 6 | test(file + 'Attepmt to invoke the scraper WITHOUT VALID callback funciton', 7 | function(t) { 8 | var cberrmsg = "callback is required" 9 | try { 10 | switcher(); 11 | } catch (error){ 12 | // console.log(error); 13 | t.ok(error.indexOf(cberrmsg) > -1, "Got ERROR: "+error + " (as expected!)"); 14 | t.end(); 15 | } 16 | }) 17 | 18 | test(file + 'Force switcher error by not setting the url', function(t){ 19 | var url; 20 | switcher(url, function(err, data){ 21 | t.ok(err === 404, 'Got 404 Error when username does not exist'); 22 | t.end(); 23 | }) 
24 | }) 25 | 26 | test(file + 'Try to break switcher by supplying non-existent user', function(t){ 27 | var url = '/' + Math.floor(Math.random() * 1000000000000000); 28 | switcher(url, function(err, data){ 29 | t.ok(err === 404, 'Got 404 Error when username does not exist'); 30 | t.end(); 31 | }) 32 | }) 33 | 34 | test.skip('Scrape a user profile supplying only the username', function(t){ 35 | var url = 'iteles' 36 | switcher(url, function(err, data) { 37 | t.ok(data.followercount > 40, '@'+url+'has '+data.followercount+' followers') 38 | console.log(' - - - - - - - - - - - - - - - - - - - - -') 39 | console.log(data); 40 | console.log(' - - - - - - - - - - - - - - - - - - - - -') 41 | t.end() 42 | }); 43 | }) 44 | 45 | test(file + 'Should correctly identify org repositories page', function(t){ 46 | var url = 'https://github.com/orgs/dwyl/repositories?type=all'; 47 | console.log('url:', url) 48 | switcher(url, function(err, data) { 49 | t.ok(data.type === 'org_repos') 50 | t.end() 51 | }) 52 | }) 53 | 54 | // var url = 'https://github.com/iteles/followers' 55 | // switcher(url, function(err, data){ 56 | // console.log(data); 57 | // }) 58 | 59 | // var url = 'https://github.com/alanshaw/david-www/stargazers' 60 | // switcher(url, function(err, data) { 61 | // console.log(data); 62 | // }); 63 | 64 | // var url = 'https://github.com/alanshaw/followers' 65 | // switcher(url, function(err, data) { 66 | // console.log(data); 67 | // }); 68 | 69 | // var url = 'dwyl' 70 | // switcher(url, function(err, data) { 71 | // console.log(data); 72 | // }); 73 | 74 | 75 | // var url2 = 'https://github.com/iteles/following?page=2' 76 | // switcher(url2, function(err, data){ 77 | // console.log(data); 78 | // }) 79 | // 80 | // 81 | // var url3 = 'https://github.com/iteles/following?page=2' 82 | // switcher(url3, function(err, data){ 83 | // console.log(data); 84 | // }) 85 | -------------------------------------------------------------------------------- /test/url_validator.test.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'); 2 | var validate = require('../lib/url_validator'); 3 | 4 | test('Attempt to call scraper without a url (error test) ', function(t) { 5 | validate(null, function(err){ 6 | t.ok(err, 400, 'Receive 400 Error when url is null'); 7 | t.end(); 8 | }) 9 | }) 10 | 11 | test('Attempt to call scraper with blank url', function(t) { 12 | validate('', function(err){ 13 | t.ok(err, 400, 'Receive 400 Error when orgname is too short'); 14 | t.end(); 15 | }) 16 | }) 17 | 18 | test('Call scraper with url without leading forward slash', function(t) { 19 | var url = validate('iteles', function(err){ }); 20 | console.log(url) 21 | t.ok(url, 400, 'Receive 400 Error when orgname is too short'); 22 | t.end(); 23 | }) 24 | 25 | // see: https://github.com/nelsonic/github-scraper/issues/84 26 | test('url_validator does NOT contain (perfectly valid) url containing word "undefined"', function(t) { 27 | var url = validate('/undefined/followers'); 28 | var expected = '/undefined/followers'; 29 | t.equal(url, expected, 'User "@undefined" is legit: ' + url); 30 | t.end(); 31 | }); 32 | 33 | test('Call scraper with full (valid) GitHub URL', function(t) { 34 | var url = 'https://github.com/iteles' 35 | var expected = url.split('https://github.com')[1]; 36 | var actual = validate(url, function(err){ }); 37 | console.log(expected, actual) 38 | t.equal(expected, actual, 'No change to url'); 39 | t.end(); 40 | }) 41 | 42 | test('Confirm url validator 
transforms iteles/followers?page=2 into full url', function(t){
43 |   var url = 'iteles/followers?page=2'
44 |   // var url1 = 'https://github.com/iteles/followers?page=2'
45 |   var url2 = validate(url, function(err){ });
46 |   console.log(url, url2)
47 |   t.ok('/' + url === url2, url + ' equal to: ' + url2);
48 |   t.end();
49 | })
50 | 
51 | // see: https://github.com/nelsonic/github-scraper/issues/60
52 | test('Regression Test for issue #60', function(t){
53 |   var url = 'hangouts/followers';
54 |   var actual = validate(url, function(err){ });
55 |   // console.log(url1, url2)
56 |   t.ok('/' + url === actual, url + ' successfully transformed to: ' + actual);
57 |   t.end();
58 | })
59 | 
--------------------------------------------------------------------------------
/test/utils.test.js:
--------------------------------------------------------------------------------
1 | const test = require('tape');
2 | const parse_int = require('../lib/utils').parse_int;
3 | 
4 | test('parse_int Parses Strings from repo stats into Ints', function(t) {
5 |   t.equal(parse_int("1"), 1, '"1" => 1')
6 |   t.equal(parse_int(" 1 "), 1, '" 1 " => 1')
7 |   t.equal(parse_int("300"), 300, '"300" => 300')
8 |   t.equal(parse_int("1k"), 1000, '"1k" => 1000')
9 |   t.equal(parse_int("4.3k"), 4300, '"4.3k" => 4300')
10 |   t.equal(parse_int("89.6k"), 89600, '"89.6k" => 89600')
11 |   t.equal(parse_int("146k"), 146000, '"146k" => 146000')
12 |   t.equal(parse_int("310k"), 310000, '"310k" => 310000')
13 |   t.equal(parse_int("1m"), 1000000, '"1m" => 1000000')
14 |   t.equal(parse_int("1.1m"), 1100000, '"1.1m" => 1100000')
15 |   t.end()
16 | })
17 | 
--------------------------------------------------------------------------------
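Usage sketch: package.json and test/e2e.test.js together show the public API, one exported function that takes any GitHub path or full URL plus a Node-style callback whose first argument is an HTTP status code on failure. This is a minimal sketch based only on the API those tests exercise; the field names (entries, next_page) are the ones the tests above assert:

var gs = require('github-scraper'); // name from package.json; inside this repo: require('./lib')

gs('iteles?tab=followers', function (err, data) {
  if (err) { return console.error('scrape failed, status:', err); } // e.g. 404, per the switcher tests
  console.log(data.entries.length + ' followers on this page');
  if (data.next_page) { // paginate exactly as the followers tests do
    gs(data.next_page, function (err2, page2) {
      console.log('next page has ' + page2.entries.length + ' more');
    });
  }
});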