├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ └── feature_request.md ├── .gitignore ├── .gitlab-ci.yml ├── .npmignore ├── .travis.yml ├── CONTRIBUTING.md ├── FAQ.md ├── LICENSE ├── README.md ├── assets ├── logo.png └── logo.svg ├── docs ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── _config.yml ├── _layouts │ └── default.html ├── api-change.md ├── assets │ └── img │ │ └── logo.png ├── capture.webm ├── favicon.ico └── index.md ├── examples ├── README.md ├── complexity.ts ├── package-lock.json ├── package.json ├── server.ts ├── tsconfig.json └── tslint.json ├── index.ts ├── man └── instamancer.1 ├── package-lock.json ├── package.json ├── plugins ├── README.md ├── index.ts ├── plugin.ts └── plugins │ ├── index.ts │ └── largeFirst.ts ├── src ├── api │ ├── api.ts │ ├── instagram.ts │ ├── postIdSet.ts │ ├── search.ts │ └── types.ts ├── cli.ts ├── getpool │ └── getPool.ts └── http │ ├── depot.ts │ ├── download.ts │ └── s3.ts ├── tests ├── __fixtures__ │ ├── FakePage.ts │ └── QuickGraft.ts ├── server.ts ├── test.spec.ts └── tsconfig.json ├── tsconfig.json ├── tslint.json └── utils └── validation-generator ├── .gitignore ├── README.md ├── generate.ts └── get-input.ts /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help Instamancer improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: ScriptSmith 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. 15 | 16 | - If the bug is related to the CLI, include the command you used. 17 | - If it's related to using the module, provide some sample code. 18 | - If it's related to the module itself, indicate the source of the problem. 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Output** 24 | If applicable, add Instamancer's output in a code block 25 | 26 | ``` 27 | here 28 | ``` 29 | 30 | **Setup (please complete the following information):** 31 | - OS: [e.g. Arch Linux, MacOS] 32 | - Instamancer version [e.g. v1.1.4] 33 | - Node version [e.g. v11.6.0] 34 | - NPM version (if applicable) [eg. 6.5.0] 35 | 36 | **Additional context** 37 | Add any other context about the problem here. 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | node_modules/ 3 | examples/node_modules/ 4 | coverage/ 5 | downloads/ 6 | *.map 7 | *.js 8 | *.d.ts 9 | *.tgz 10 | *.log 11 | *.csv 12 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | default: 2 | image: node:latest 3 | variables: 4 | CI: 1 5 | NO_SANDBOX: 1 6 | before_script: 7 | - npm install -g codacy-coverage 8 | 9 | - apt-get update 10 | - apt-get install -y wget gnupg 11 | - wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - 12 | - sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' 13 | - apt-get update 14 | - apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 --no-install-recommends 15 | - apt-get install -y xvfb 16 | - rm -rf /var/lib/apt/lists/* 17 | script: 18 | - npm install 19 | - npm run build -- --noEmit 20 | - xvfb-run --server-args="-screen 0 1024x768x24" npm run test:ci 21 | after_script: 22 | - cat ./coverage/lcov.info | codacy-coverage --language=typescript; 23 | artifacts: 24 | paths: 25 | - instamancer_tests.log 26 | expire_in: 1 week -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .github/ 3 | docs/ 4 | assets/ 5 | coverage/ 6 | test* 7 | .travis.yml 8 | ts*.json 9 | *.js.map 10 | *.log 11 | *.tgz 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 
2 | node_js: 3 | - "node" 4 | - "lts/*" 5 | dist: bionic 6 | addons: 7 | chrome: stable 8 | artifacts: 9 | paths: 10 | - $(ls *.log | tr "\n" ":") 11 | services: 12 | - xvfb 13 | before_install: 14 | # Enable user namespace cloning for pyppeteer 15 | - sysctl kernel.unprivileged_userns_clone=1 16 | # Launch XVFB for pyppeteer 17 | - export DISPLAY=:99.0 18 | install: 19 | - npm install -g codacy-coverage 20 | 21 | # Install instamancer and deps 22 | - npm install 23 | script: 24 | - npm run build -- --noEmit 25 | - npm run test:ci 26 | - if [[ $TRAVIS_PULL_REQUEST = "false" ]] ; then cat ./coverage/lcov.info | codacy-coverage --language=typescript; fi 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thanks! I'm glad for your interest in the project. Here are some guidelines: 4 | 5 | ## Bugs 6 | Create a [new issue](https://github.com/ScriptSmith/instamancer/issues/new) with the provided template and the `bug` label. 7 | 8 | ## Feature / pull requests 9 | Make sure you submit a new issue with the `feature` label before submitting a pull request. If you aren't sure whether a proposed change is possible / within the scope of the project, just ask. 10 | 11 | ## Chat / Questions 12 | Pop in to the [Gitter](https://gitter.im/instamancer) -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | ## Does it still work? 3 | At the time of writing, Instamancer still works. It's possible that it will break when Instagram.com is updated, or Instagram tries to curb this method of scraping. 4 | 5 | There is a daily Travis cron job which tests whether Instamancer is working as expected. 
You can see the results here: [![Build Status](https://img.shields.io/gitlab/pipeline/scriptsmith/instamancer)](https://gitlab.com/ScriptSmith/instamancer/pipelines) 6 | 7 | ## Is there a GUI? 8 | No, Instamancer only works from the command line. In the future, I might implement a GUI using [Carlo](https://github.com/GoogleChromeLabs/carlo) or something more lightweight. 9 | 10 | There is an Instagram data exploration tool in development here: [https://github.com/andyepx/insta-explorer](https://github.com/andyepx/insta-explorer) 11 | 12 | ## Do I need to log in? 13 | No. Instamancer scrapes data that Instagram makes publicly available. 14 | 15 | ## How quickly does it run? 16 | It can process anywhere from 3 to 30 posts per second depending on configuration. 17 | 18 | ## Can I make it run faster? 19 | Running without the `--full` and `-d` arguments is faster. 20 | 21 | Not using `--sync` and customising the `-k` option can make downloading files quicker. 22 | 23 | Disabling grafting with `-g=false` will make the scraping quicker at the cost of not being able to access all posts (see [here](#what-happens-if-i-disable-grafting)). 24 | 25 | Setting `--sleep` to a decimal number below 1 speeds up page interactions at the cost of stability, as it makes you more likely to be rate limited. 26 | 27 | Scraping is not parallelisable (see [here](#can-i-run-multiple-instances-at-the-same-time-rather-than-batch-scraping)). 28 | 29 | Using `--plugin LargeFirst` is as much as 5x faster, but may result in undefined behavior. 30 | 31 | If you want something *really* fast, try [Instaphyte](https://github.com/ScriptSmith/instaphyte). It's as much as 12x faster. 32 | 33 | ## Can I run multiple instances at the same time rather than batch scraping? 34 | No. Instagram will probably rate-limit your IP address and then Instamancer will have to pause until the limit is lifted. 35 | 36 | ## What happens if I disable grafting? 
37 | Chrome / Chromium will eventually decide that it doesn't want the page to consume any more resources and future requests to the API will be aborted. This usually happens between 5k-10k posts regardless of the memory available on the system. There doesn't seem to be any combination of Chrome flags to avoid this. 38 | 39 | ## How far back can I scrape? 40 | Seemingly as far as there are posts to scrape, but you can only reach old posts by scraping the most recent ones. 41 | 42 | ## How many posts can I scrape from a given endpoint? 43 | The most I've seen is more than 5 million. 44 | 45 | ## How do I scrape the first posts on the page? 46 | 47 | In the default configuration, Instamancer will skip the posts that are pre-loaded on the page. This is because it only retrieves posts generated from API requests, which aren't made for these posts. 48 | 49 | If you would like to retrieve these posts, then you should use full mode: `--full` or `-f`. 50 | 51 | This behavior may change in the future. 52 | 53 | ## How do I use the `--bucket` flag and S3? 54 | 1. Create an S3 bucket. Find help [here](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html). 55 | 2. Configure your AWS credentials. Find help [here](https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/loading-node-credentials-shared.html). 56 | 1. Ensure you can write to S3 with the credentials you're using. 57 | 3. Use instamancer like so: 58 | 59 | ``` 60 | instamancer ... -d --bucket=BUCKET_NAME 61 | ``` 62 | 63 | Where `BUCKET_NAME` is the name of the bucket. 64 | 65 | Example: 66 | 67 | ``` 68 | instamancer hashtag puppies -c10 -d --bucket=instagram-puppies 69 | ``` 70 | 71 | 72 | ## How do I use the `--depot` flag and depot? 73 | 1. Set up [depot](https://github.com/ScriptSmith/depot) 74 | 1. Set up basic access authentication if you're using a public server 75 | 2. Generate a UUIDv4 76 | 3. Use instamancer like so: 77 | 78 | ``` 79 | instamancer ... 
-d --depot=http://127.0.0.1:8080/jobs/UUID/ 80 | ``` 81 | 82 | Where `UUID` is the UUID you generated. 83 | 84 | Example: 85 | 86 | ``` 87 | instamancer hashtag puppies -c10 -d --depot=https://depot:password@depot-vlnbfvyaiq-uc.a.run.app/jobs/4cdc21fe-6b35-473a-b26e-66f62ad66c4c/ 88 | ``` 89 | 90 | You can use any server that accepts `PUT` requests. 91 | 92 | 93 | ## What does a batchfile look like? 94 | ``` 95 | hashtag spring -d --full 96 | hashtag summer -f=data.json 97 | user greg -c100 98 | ``` 99 | 100 | ## Why does the code have so many comments? 101 | Instamancer was originally part of another project written in Python that used the [Pyppeteer](https://github.com/miyakogi/pyppeteer) clone of Puppeteer. This version was too error-prone because of the complicated asyncio code and Pyppeteer's instability when communicating via websockets during long scraping jobs. 102 | 103 | I decided to rewrite Instamancer in TypeScript in order to be more stable and in-sync with Puppeteer. It was the first time I'd written any serious TypeScript or 'modern' JavaScript (promises, async/await etc.), so the zealous commenting helped me learn, and allowed me to figure out bugs in my algorithm and the grafting process. The comments aren't a permanent fixture and may be removed in a future commit. 
104 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Adam Smith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

Instamancer

6 | 7 | [![Quality](https://img.shields.io/codacy/grade/98066a13fa444845aa3902d180581b86.svg)](https://app.codacy.com/project/ScriptSmith/instamancer/dashboard) 8 | [![Coverage](https://img.shields.io/codacy/coverage/98066a13fa444845aa3902d180581b86.svg)](https://app.codacy.com/project/ScriptSmith/instamancer/dashboard) 9 | [![Speed](https://firebasestorage.googleapis.com/v0/b/instagram-speed-test.appspot.com/o/instamancer.svg?alt=media&token=dcc3e623-ee88-4d74-ae86-2d969a1cd8ad)](https://scriptsmith.github.io/instagram-speed-test) 10 | [![NPM](https://img.shields.io/npm/v/instamancer.svg)](https://www.npmjs.com/package/instamancer) 11 | [![Dependencies](https://david-dm.org/scriptsmith/instamancer/status.svg)](https://david-dm.org/scriptsmith/instamancer) 12 | [![Chat](https://img.shields.io/gitter/room/instamancer/instamancer.svg)](https://gitter.im/instamancer) 13 | 14 | Scrape Instagram's API with Puppeteer. 15 | 16 | ###### [Install](#Install) | [Usage](#Usage) | [Comparison](#Comparison) | [Website](https://scriptsmith.github.io/instamancer/) | [FAQ](FAQ.md) | [Examples](examples/README.md) 17 | 18 |
19 | 20 | **Notice:** Instagram's Web UI and API now require users to be logged in to access hashtag and account endpoints through a browser. As Instamancer is designed to access publicly available data, it currently does not work as intended. Given that this change is unlikely to be reversed, Instamancer will remain unsupported and unmaintained indefinitely. Please use [this pinned issue](https://github.com/ScriptSmith/instamancer/issues/58) to discuss. 21 | 22 |
23 | 24 | 25 | Instamancer is a new type of scraping tool that leverages Puppeteer's ability to intercept requests made by a webpage to an API. 26 | 27 | Read more about how Instamancer works [here](https://scriptsmith.github.io/instamancer/). 28 | 29 | ### Features 30 | - Scrape hashtags, users' posts, and individual posts 31 | - Download images, albums, and videos 32 | - Output JSON, CSV 33 | - Batch scraping 34 | - Search hashtags, users, and locations 35 | - API response validation 36 | - Upload files to [S3](https://github.com/ScriptSmith/instamancer/blob/master/FAQ.md#how-do-i-use-the---bucket-flag-and-s3) and [depot](https://github.com/ScriptSmith/instamancer/blob/master/FAQ.md#how-do-i-use-the---depot-flag-and-depot) 37 | - [Plugins](plugins) 38 | 39 | ### Data 40 | Metadata that Instamancer is able to gather from posts: 41 | 42 | - Text 43 | - Timestamps 44 | - Tagged users 45 | - Accessibility captions 46 | - Like counts 47 | - Comment counts 48 | - Images (Thumbnails, Dimensions, URLs) 49 | - Videos (URL, View count, Duration) 50 | - Comments (Timestamp, Text, Like count, User) 51 | - User (Username, Full name, Profile picture, Profile privacy) 52 | - Location (Name, Street, Zip code, City, Region, Country) 53 | - Sponsored status 54 | - Gating information 55 | - Fact checking information 56 | 57 | ## Install 58 | 59 | #### Linux 60 | Enable user namespace cloning: 61 | ``` 62 | sysctl -w kernel.unprivileged_userns_clone=1 63 | ``` 64 | 65 | Or run without a sandbox: 66 | 67 | ``` 68 | # WARNING: unsafe 69 | export NO_SANDBOX=true 70 | ``` 71 | 72 | See [Puppeteer troubleshooting](https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md#chrome-headless-fails-due-to-sandbox-issues) 73 | 74 | #### Without downloading chromium 75 | If you wish to install Instamancer without downloading chromium, enable the `PUPPETEER_SKIP_CHROMIUM_DOWNLOAD` environment variable before installation 76 | 77 | ``` 78 | export 
PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true 79 | ``` 80 | 81 | ### From NPM 82 | 83 | ``` 84 | npm install -g instamancer 85 | ``` 86 | 87 | If you're using root to install globally, use the following command to install the Puppeteer dependency 88 | 89 | ``` 90 | sudo npm install -g instamancer --unsafe-perm=true 91 | ``` 92 | 93 | ### From NPX 94 | 95 | ``` 96 | npx instamancer 97 | ``` 98 | 99 | ### From this repository 100 | ``` 101 | git clone https://github.com/ScriptSmith/instamancer.git 102 | cd instamancer 103 | npm install 104 | npm run build 105 | npm install -g 106 | ``` 107 | 108 | ## Usage 109 | 110 | ### Command Line 111 | ``` 112 | $ instamancer 113 | Usage: instamancer [options] 114 | 115 | Commands: 116 | instamancer hashtag [id] Scrape a hashtag 117 | instamancer user [id] Scrape a users posts 118 | instamancer post [ids] Scrape a comma-separated list of posts 119 | instamancer search [query] Perform a search of users, tags and places 120 | instamancer batch [batchfile] Read newline-separated arguments from a file 121 | 122 | Configuration 123 | --count, -c Number of posts to download (0 for all) [number] [default: 0] 124 | --full, -f Retrieve full post data [boolean] [default: false] 125 | --sleep, -s Seconds to sleep between interactions [number] [default: 2] 126 | --graft, -g Enable grafting [boolean] [default: true] 127 | --browser, -b Browser path. 
Defaults to the puppeteer version [string] 128 | --sameBrowser Use a single browser when grafting [boolean] [default: false] 129 | 130 | Download 131 | --download, -d Save images from posts [boolean] [default: false] 132 | --downdir Download path [default: "downloads/[endpoint]/[id]"] 133 | --video, -v Download videos (requires full) [boolean] [default: false] 134 | --sync Force download between requests [boolean] [default: false] 135 | --threads, -k Parallel download / depot threads [number] [default: 4] 136 | --waitDownload, -w Download media after scraping [boolean] [default: false] 137 | 138 | Upload 139 | --bucket Upload files to an AWS S3 bucket [string] 140 | --depot Upload files to a URL with a PUT request (depot) [string] 141 | 142 | Output 143 | --file, -o Output filename. '-' for stdout [string] [default: "[id]"] 144 | --type, -t Filetype [choices: "csv", "json", "both"] [default: "json"] 145 | --mediaPath, -m Add filepaths to _mediaPath [boolean] [default: false] 146 | 147 | Display 148 | --visible Show browser on the screen [boolean] [default: false] 149 | --quiet, -q Disable progress output [boolean] [default: false] 150 | 151 | Logging 152 | --logging, -l [choices: "none", "error", "info", "debug"] [default: "none"] 153 | --logfile Log file name [string] [default: "instamancer.log"] 154 | 155 | Validation 156 | --strict Throw an error on response type mismatch [boolean] [default: false] 157 | 158 | Plugins 159 | --plugin, -p Use a plugin from the plugins directory [array] [default: []] 160 | 161 | Options: 162 | --help Show help [boolean] 163 | --version Show version number [boolean] 164 | 165 | Examples: 166 | instamancer hashtag instagood -fvd Download all the available posts, 167 | and their media from #instagood 168 | instamancer user arianagrande --type=csv Download Ariana Grande's posts to a 169 | --logging=info --visible CSV file with a non-headless 170 | browser, and log all events 171 | 172 | Source code available at 
https://github.com/ScriptSmith/instamancer 173 | 174 | ``` 175 | 176 | ### Module 177 | 178 | ES2018 TypeScript example: 179 | ```typescript 180 | import {createApi, IOptions} from "instamancer" 181 | 182 | const options: IOptions = { 183 | total: 10 184 | }; 185 | const hashtag = createApi("hashtag", "beach", options); 186 | 187 | (async () => { 188 | for await (const post of hashtag.generator()) { 189 | console.log(post); 190 | } 191 | })(); 192 | ``` 193 | 194 | #### Generator functions 195 | 196 | ```typescript 197 | import {createApi} from "instamancer" 198 | 199 | createApi("hashtag", id, options); 200 | createApi("user", id, options); 201 | createApi("post", ids, options); 202 | createApi("search", query, options); 203 | ``` 204 | 205 | #### Options 206 | ```typescript 207 | const options: Instamancer.IOptions = { 208 | // Total posts to download. 0 for unlimited 209 | total: number, 210 | 211 | // Run Chrome in headless mode 212 | headless: boolean, 213 | 214 | // Logging events 215 | logger: winston.Logger, 216 | 217 | // Run without output to stdout 218 | silent: boolean, 219 | 220 | // Time to sleep between interactions with the page 221 | sleepTime: number, 222 | 223 | // Throw an error if type validation fails 224 | strict: boolean, 225 | 226 | // Time to sleep when rate-limited 227 | hibernationTime: number, 228 | 229 | // Enable the grafting process 230 | enableGrafting: boolean, 231 | 232 | // Extract the full amount of information from the API 233 | fullAPI: boolean, 234 | 235 | // Use a proxy in Chrome to connect to Instagram 236 | proxyURL: string, 237 | 238 | // Location of the chromium / chrome binary executable 239 | executablePath: string, 240 | 241 | // Custom io-ts validator 242 | validator: Type, 243 | 244 | // Custom plugins 245 | plugins: IPlugin[] 246 | } 247 | ``` 248 | 249 | ## Comparison 250 | 251 | A comparison of Instagram scraping tools. Please suggest more tools and criteria through a pull request. 
252 | 253 | To see a speed comparison, visit [this page](https://scriptsmith.github.io/instagram-speed-test) 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 |
ToolHashtagsUsersTagged postsLocationsPostsStoriesLogin not requiredPrivate feedsBatch modePluginsCommand-lineLibrary/ModuleDownload mediaDownload metadataScraping methodDaily buildsMain languageSpeed ____________________________License ____________________________Last commit ____________________________Open Issues ____________________________Closed Issues ____________________________Build status ____________________________Test coverage ____________________________Code quality ____________________________
Instamancer:heavy_check_mark::heavy_check_mark::x::x::heavy_check_mark::x::heavy_check_mark::x::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API request interception:heavy_check_mark:Typescript
Instaphyte:heavy_check_mark::x::x::x::x::x::heavy_check_mark::x::x::x::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API simulation:heavy_check_mark:Python
Instaloader:heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::x::x::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API simulation:x:Python:question::question:
Instalooter:heavy_check_mark::heavy_check_mark::x::heavy_check_mark::heavy_check_mark::x::x::heavy_check_mark::heavy_check_mark::x::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API simulation:x:Python
Instagram crawler:heavy_check_mark::heavy_check_mark::x::x::heavy_check_mark::x::heavy_check_mark::x::x::x::heavy_check_mark::heavy_check_mark::x::heavy_check_mark:Web DOM reading:x:Python:question::question::question:
Instagram Scraper:heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::x::heavy_check_mark::x::heavy_check_mark::x::x::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API simulation:x:Python:question::question:
Instagram Private API:heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::heavy_check_mark::x::x::x::heavy_check_mark::heavy_check_mark::heavy_check_mark:App and Web API simulation:x:Python:question::question::question:
Instagram PHP Scraper:heavy_check_mark::heavy_check_mark::x::heavy_check_mark::heavy_check_mark::x::heavy_check_mark::heavy_check_mark::x::x::x::heavy_check_mark::heavy_check_mark::heavy_check_mark:Web API simulation:x:PHP:question::question::question::question:
513 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/assets/logo.png -------------------------------------------------------------------------------- /assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 15 | 17 | 18 | 20 | 22 | 26 | 30 | 31 | 33 | 37 | 41 | 42 | 44 | 48 | 52 | 53 | 56 | 60 | 61 | 70 | 79 | 88 | 96 | 97 | 101 | 105 | 109 | 113 | 117 | 121 | 122 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site/ 2 | .jekyll-metadata -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gem "github-pages", group: :jekyll_plugins -------------------------------------------------------------------------------- /docs/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (4.2.11.1) 5 | i18n (~> 0.7) 6 | minitest (~> 5.1) 7 | thread_safe (~> 0.3, >= 0.3.4) 8 | tzinfo (~> 1.1) 9 | addressable (2.7.0) 10 | public_suffix (>= 2.0.2) 11 | coffee-script (2.4.1) 12 | coffee-script-source 13 | execjs 14 | coffee-script-source (1.11.1) 15 | colorator (1.1.0) 16 | commonmarker (0.17.13) 17 | ruby-enum (~> 0.5) 18 | concurrent-ruby (1.1.5) 19 | dnsruby (1.61.3) 20 | addressable (~> 2.5) 21 | em-websocket (0.5.1) 22 | eventmachine (>= 0.12.9) 23 | http_parser.rb (~> 0.6.0) 24 | ethon (0.12.0) 25 | ffi (>= 1.3.0) 26 | eventmachine (1.2.7) 27 | execjs (2.7.0) 28 | faraday (0.17.0) 29 | 
multipart-post (>= 1.2, < 3) 30 | ffi (1.11.1) 31 | forwardable-extended (2.6.0) 32 | gemoji (3.0.1) 33 | github-pages (201) 34 | activesupport (= 4.2.11.1) 35 | github-pages-health-check (= 1.16.1) 36 | jekyll (= 3.8.5) 37 | jekyll-avatar (= 0.6.0) 38 | jekyll-coffeescript (= 1.1.1) 39 | jekyll-commonmark-ghpages (= 0.1.6) 40 | jekyll-default-layout (= 0.1.4) 41 | jekyll-feed (= 0.11.0) 42 | jekyll-gist (= 1.5.0) 43 | jekyll-github-metadata (= 2.12.1) 44 | jekyll-mentions (= 1.4.1) 45 | jekyll-optional-front-matter (= 0.3.0) 46 | jekyll-paginate (= 1.1.0) 47 | jekyll-readme-index (= 0.2.0) 48 | jekyll-redirect-from (= 0.14.0) 49 | jekyll-relative-links (= 0.6.0) 50 | jekyll-remote-theme (= 0.4.0) 51 | jekyll-sass-converter (= 1.5.2) 52 | jekyll-seo-tag (= 2.5.0) 53 | jekyll-sitemap (= 1.2.0) 54 | jekyll-swiss (= 0.4.0) 55 | jekyll-theme-architect (= 0.1.1) 56 | jekyll-theme-cayman (= 0.1.1) 57 | jekyll-theme-dinky (= 0.1.1) 58 | jekyll-theme-hacker (= 0.1.1) 59 | jekyll-theme-leap-day (= 0.1.1) 60 | jekyll-theme-merlot (= 0.1.1) 61 | jekyll-theme-midnight (= 0.1.1) 62 | jekyll-theme-minimal (= 0.1.1) 63 | jekyll-theme-modernist (= 0.1.1) 64 | jekyll-theme-primer (= 0.5.3) 65 | jekyll-theme-slate (= 0.1.1) 66 | jekyll-theme-tactile (= 0.1.1) 67 | jekyll-theme-time-machine (= 0.1.1) 68 | jekyll-titles-from-headings (= 0.5.1) 69 | jemoji (= 0.10.2) 70 | kramdown (= 1.17.0) 71 | liquid (= 4.0.0) 72 | listen (= 3.1.5) 73 | mercenary (~> 0.3) 74 | minima (= 2.5.0) 75 | nokogiri (>= 1.10.4, < 2.0) 76 | rouge (= 3.11.0) 77 | terminal-table (~> 1.4) 78 | github-pages-health-check (1.16.1) 79 | addressable (~> 2.3) 80 | dnsruby (~> 1.60) 81 | octokit (~> 4.0) 82 | public_suffix (~> 3.0) 83 | typhoeus (~> 1.3) 84 | html-pipeline (2.12.0) 85 | activesupport (>= 2) 86 | nokogiri (>= 1.4) 87 | http_parser.rb (0.6.0) 88 | i18n (0.9.5) 89 | concurrent-ruby (~> 1.0) 90 | jekyll (3.8.5) 91 | addressable (~> 2.4) 92 | colorator (~> 1.0) 93 | em-websocket (~> 0.5) 94 | i18n (~> 0.7) 
95 | jekyll-sass-converter (~> 1.0) 96 | jekyll-watch (~> 2.0) 97 | kramdown (~> 1.14) 98 | liquid (~> 4.0) 99 | mercenary (~> 0.3.3) 100 | pathutil (~> 0.9) 101 | rouge (>= 1.7, < 4) 102 | safe_yaml (~> 1.0) 103 | jekyll-avatar (0.6.0) 104 | jekyll (~> 3.0) 105 | jekyll-coffeescript (1.1.1) 106 | coffee-script (~> 2.2) 107 | coffee-script-source (~> 1.11.1) 108 | jekyll-commonmark (1.3.1) 109 | commonmarker (~> 0.14) 110 | jekyll (>= 3.7, < 5.0) 111 | jekyll-commonmark-ghpages (0.1.6) 112 | commonmarker (~> 0.17.6) 113 | jekyll-commonmark (~> 1.2) 114 | rouge (>= 2.0, < 4.0) 115 | jekyll-default-layout (0.1.4) 116 | jekyll (~> 3.0) 117 | jekyll-feed (0.11.0) 118 | jekyll (~> 3.3) 119 | jekyll-gist (1.5.0) 120 | octokit (~> 4.2) 121 | jekyll-github-metadata (2.12.1) 122 | jekyll (~> 3.4) 123 | octokit (~> 4.0, != 4.4.0) 124 | jekyll-mentions (1.4.1) 125 | html-pipeline (~> 2.3) 126 | jekyll (~> 3.0) 127 | jekyll-optional-front-matter (0.3.0) 128 | jekyll (~> 3.0) 129 | jekyll-paginate (1.1.0) 130 | jekyll-readme-index (0.2.0) 131 | jekyll (~> 3.0) 132 | jekyll-redirect-from (0.14.0) 133 | jekyll (~> 3.3) 134 | jekyll-relative-links (0.6.0) 135 | jekyll (~> 3.3) 136 | jekyll-remote-theme (0.4.0) 137 | addressable (~> 2.0) 138 | jekyll (~> 3.5) 139 | rubyzip (>= 1.2.1, < 3.0) 140 | jekyll-sass-converter (1.5.2) 141 | sass (~> 3.4) 142 | jekyll-seo-tag (2.5.0) 143 | jekyll (~> 3.3) 144 | jekyll-sitemap (1.2.0) 145 | jekyll (~> 3.3) 146 | jekyll-swiss (0.4.0) 147 | jekyll-theme-architect (0.1.1) 148 | jekyll (~> 3.5) 149 | jekyll-seo-tag (~> 2.0) 150 | jekyll-theme-cayman (0.1.1) 151 | jekyll (~> 3.5) 152 | jekyll-seo-tag (~> 2.0) 153 | jekyll-theme-dinky (0.1.1) 154 | jekyll (~> 3.5) 155 | jekyll-seo-tag (~> 2.0) 156 | jekyll-theme-hacker (0.1.1) 157 | jekyll (~> 3.5) 158 | jekyll-seo-tag (~> 2.0) 159 | jekyll-theme-leap-day (0.1.1) 160 | jekyll (~> 3.5) 161 | jekyll-seo-tag (~> 2.0) 162 | jekyll-theme-merlot (0.1.1) 163 | jekyll (~> 3.5) 164 | jekyll-seo-tag (~> 2.0) 
165 | jekyll-theme-midnight (0.1.1) 166 | jekyll (~> 3.5) 167 | jekyll-seo-tag (~> 2.0) 168 | jekyll-theme-minimal (0.1.1) 169 | jekyll (~> 3.5) 170 | jekyll-seo-tag (~> 2.0) 171 | jekyll-theme-modernist (0.1.1) 172 | jekyll (~> 3.5) 173 | jekyll-seo-tag (~> 2.0) 174 | jekyll-theme-primer (0.5.3) 175 | jekyll (~> 3.5) 176 | jekyll-github-metadata (~> 2.9) 177 | jekyll-seo-tag (~> 2.0) 178 | jekyll-theme-slate (0.1.1) 179 | jekyll (~> 3.5) 180 | jekyll-seo-tag (~> 2.0) 181 | jekyll-theme-tactile (0.1.1) 182 | jekyll (~> 3.5) 183 | jekyll-seo-tag (~> 2.0) 184 | jekyll-theme-time-machine (0.1.1) 185 | jekyll (~> 3.5) 186 | jekyll-seo-tag (~> 2.0) 187 | jekyll-titles-from-headings (0.5.1) 188 | jekyll (~> 3.3) 189 | jekyll-watch (2.2.1) 190 | listen (~> 3.0) 191 | jemoji (0.10.2) 192 | gemoji (~> 3.0) 193 | html-pipeline (~> 2.2) 194 | jekyll (~> 3.0) 195 | kramdown (1.17.0) 196 | liquid (4.0.0) 197 | listen (3.1.5) 198 | rb-fsevent (~> 0.9, >= 0.9.4) 199 | rb-inotify (~> 0.9, >= 0.9.7) 200 | ruby_dep (~> 1.2) 201 | mercenary (0.3.6) 202 | mini_portile2 (2.4.0) 203 | minima (2.5.0) 204 | jekyll (~> 3.5) 205 | jekyll-feed (~> 0.9) 206 | jekyll-seo-tag (~> 2.1) 207 | minitest (5.12.2) 208 | multipart-post (2.1.1) 209 | nokogiri (1.10.8) 210 | mini_portile2 (~> 2.4.0) 211 | octokit (4.14.0) 212 | sawyer (~> 0.8.0, >= 0.5.3) 213 | pathutil (0.16.2) 214 | forwardable-extended (~> 2.6) 215 | public_suffix (3.1.1) 216 | rb-fsevent (0.10.3) 217 | rb-inotify (0.10.0) 218 | ffi (~> 1.0) 219 | rouge (3.11.0) 220 | ruby-enum (0.7.2) 221 | i18n 222 | ruby_dep (1.5.0) 223 | rubyzip (2.0.0) 224 | safe_yaml (1.0.5) 225 | sass (3.7.4) 226 | sass-listen (>= 4.0.0) 227 | sass-listen (4.0.0) 228 | rb-inotify (>= 0.9.7, >= 0.9) 229 | sawyer (0.8.2) 230 | addressable (>= 2.3.5) 231 | faraday (> 0.8, < 2.0) 232 | terminal-table (1.8.0) 233 | unicode-display_width (~> 1.1, >= 1.1.1) 234 | thread_safe (0.3.6) 235 | typhoeus (1.3.1) 236 | ethon (>= 0.9.0) 237 | tzinfo (1.2.5) 238 | thread_safe 
(~> 0.1) 239 | unicode-display_width (1.6.0) 240 | 241 | PLATFORMS 242 | ruby 243 | 244 | DEPENDENCIES 245 | github-pages 246 | 247 | BUNDLED WITH 248 | 2.0.1 249 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: Instamancer 2 | logo: /assets/img/logo.png 3 | description: Scrape Instagram's API with Puppeteer. 4 | show_downloads: false 5 | theme: jekyll-theme-minimal 6 | repository: ScriptSmith/instamancer 7 | google_analytics: UA-79900226-3 8 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {% if site.logo %} 9 | 10 | {% endif %} 11 | 12 | {% seo %} 13 | 14 | 17 | 18 | 19 |
20 |
21 |

22 | {% if site.logo %} 23 | Logo 24 | {% endif %} 25 | {{ site.title | default: site.github.repository_name }} 26 |

27 | 28 |

{{ site.description | default: site.github.project_tagline }}

29 | 30 | {% if site.github.is_project_page %} 31 |

View the Project on GitHub {{ site.github.repository_nwo }}

32 | {% endif %} 33 | 34 | {% if site.github.is_user_page %} 35 |

View My GitHub Profile

36 | {% endif %} 37 | 38 | {% if site.show_downloads %} 39 | 44 | {% endif %} 45 |
46 |
47 | 48 | {{ content }} 49 | 50 |
51 | 57 |
58 | 59 | {% if site.google_analytics %} 60 | 68 | {% endif %} 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/api-change.md: -------------------------------------------------------------------------------- 1 | # The Instagram API has changed 2 | Because of the way instamancer works, when Instagram changes the API for their web frontend, the data that Instamancer gathers will be affected. 3 | 4 | If you see this warning, you can: 5 | 6 | - Check for [updates](https://github.com/ScriptSmith/instamancer/releases). A new version of instamancer may have been released. 7 | - Look for reports in [open issues](https://github.com/ScriptSmith/instamancer/issues). Maybe someone else is having this problem, and is already working on a fix. 8 | - Open a [new issue](https://github.com/ScriptSmith/instamancer/issues/new/choose) if you can't find an existing one. 9 | - Create a fork of the repository and [fix the typings](https://github.com/ScriptSmith/instamancer/blob/master/utils/validation-generator/README.md#fix-typings) yourself. 
10 | -------------------------------------------------------------------------------- /docs/assets/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/assets/img/logo.png -------------------------------------------------------------------------------- /docs/capture.webm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/capture.webm -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/favicon.ico -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | # About Instamancer 6 | 7 | Instamancer is a scraping tool used in Instagram data mining and analysis projects. 8 | 9 | Traditional Instagram scrapers either use a browser to access a web-page and read the DOM, or they manually reimplement the requests that browsers make to an API endpoint. This isn't ideal because: 10 | 11 | 1. Reading the DOM ignores some information that's only stored in memory. 12 | 2. Reimplementing requests requires the deciphering and reproduction of pagination and authentication mechanisms. 13 | 3. Neither method easily tolerates changes to the front end or back end. 14 | 15 | Instamancer is unique because it doesn't read the DOM or reimplement requests. Using [Puppeteer](https://github.com/GoogleChrome/puppeteer/) it interacts with Instagram.com, then intercepts and saves the responses to requests that the page's JavaScript initiates. This means that it can retrieve the full amount of information from the API while tolerating failed requests and rate limits, without having to reimplement client-side code. This makes it much better at withstanding regular changes to the interface and API. 16 | 17 | As browsers become more and more like black boxes, this new scraping method will become increasingly relevant. 18 | 19 | Instamancer also comes with some clever tricks: 20 | 21 | - Because using a browser consumes lots of memory in large scraping jobs, Instamancer employs a new scraping technique called *grafting*. It intercepts and saves the URL and headers of each request, and then after a certain number of interactions with the page it will restart the browser and navigate back to the same page. Once the page initiates the first request to the API, its URL and headers are swapped on-the-fly with the most recently saved ones. The scraping continues without incident because the response from the API is in the correct form despite being for the incorrect data. 
22 | - Requests from pages for media and other non-API urls are intercepted and aborted to speed up scraping and conserve resources. 23 | - Instagram sends limited information through its feed API. To get extra information like the location, tagged users, and comments, Instamancer can open new tabs for each post that it scrapes, and then read the metadata from memory. 24 | 25 | # Installation 26 | 27 | To get started with Instamancer, follow the installation instructions [here](https://github.com/ScriptSmith/instamancer#Install) 28 | 29 | # Output 30 | 31 | ## Metadata 32 | 33 | Instamancer outputs metadata into JSON and CSV files. 34 | 35 | Here's a sample of output without `--full` mode: 36 | 37 | ```json 38 | [ 39 | { 40 | "node": { 41 | "comments_disabled": false, 42 | "__typename": "GraphImage", 43 | "id": "1953636359851103977", 44 | "edge_media_to_caption": { 45 | "edges": [ 46 | { 47 | "node": { 48 | "text": "Love my #dogs" 49 | } 50 | } 51 | ] 52 | }, 53 | "shortcode": "BsrrAClca9F", 54 | "edge_media_to_comment": { 55 | "count": 1 56 | }, 57 | "taken_at_timestamp": 1547102918, 58 | "dimensions": { 59 | "height": 1350, 60 | "width": 1080 61 | }, 62 | "display_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5edccf8779ca7659a5ee7bb3e5bb0ec4/5CD38B5F/t51.2885-15/e35/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 63 | "edge_liked_by": { 64 | "count": 3 65 | }, 66 | "edge_media_preview_like": { 67 | "count": 3 68 | }, 69 | "owner": { 70 | "id": "1838071775" 71 | }, 72 | "thumbnail_src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5d074edce4bd1bdb02cadb670dd62571/5CBF791C/t51.2885-15/sh0.08/e35/c0.135.1080.1080/s640x640/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 73 | "thumbnail_resources": [ 74 | { 75 | "src": 
"https://instagram.fbne3-1.fna.fbcdn.net/vp/418024ac735200f61193e0de0bc2b79f/5CC9DD07/t51.2885-15/e35/c0.135.1080.1080/s150x150/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 76 | "config_width": 150, 77 | "config_height": 150 78 | }, 79 | { 80 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ca0843efc1fa41da05f401d1d2d99c80/5CC6C84D/t51.2885-15/e35/c0.135.1080.1080/s240x240/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 81 | "config_width": 240, 82 | "config_height": 240 83 | }, 84 | { 85 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5560c9aa0cbaf43d93b9f57da63f46ae/5CD068F7/t51.2885-15/e35/c0.135.1080.1080/s320x320/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 86 | "config_width": 320, 87 | "config_height": 320 88 | }, 89 | { 90 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/1842510041138b9f71cba3a7e7991f47/5CCEDFAD/t51.2885-15/e35/c0.135.1080.1080/s480x480/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 91 | "config_width": 480, 92 | "config_height": 480 93 | }, 94 | { 95 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5d074edce4bd1bdb02cadb670dd62571/5CBF791C/t51.2885-15/sh0.08/e35/c0.135.1080.1080/s640x640/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 96 | "config_width": 640, 97 | "config_height": 640 98 | } 99 | ], 100 | "is_video": false, 101 | "accessibility_caption": "Image may contain: 1 person, dog, outdoor, closeup, water and nature" 102 | } 103 | } 104 | ] 105 | ``` 106 | 107 | And with `--full` mode: 108 | 109 | ```json 110 | [ 111 | { 112 | "shortcode_media": { 113 | "__typename": "GraphImage", 114 | "id": "1958565413572638000", 115 | "shortcode": "BsHcdeHyEgY", 116 | "dimensions": { 117 | "height": 1349, 118 | "width": 1080 119 | }, 120 | "gating_info": null, 121 | "media_preview": 
"ACEqQWKuuSmQWblCFPU8YPGPQc/WpoLWFSGRSpzwWJB7cH056eh9qvwlU3KeNrn8m+YfzxUMk8e4gfNn7wHT/wDXWd/IuwGMEkgsPXJ6H3zntR5Jzwcj04/nj/61EU6sflOT6Hr/APXHp3Hv0q0u05wMHuPShyt0CxT2D3/Sirm3/OKKXMPlM1sgEjq2B9akEIhXL/KfXGc/l/KoTcQHlWJOeBtPH/6qtI5ZQT0HXP8AP/69Uk3sK6RlzRuMMePccf59s81dUtcEGV9igAfLwT6kt2B9KDdI+VABx0yOPc89hWRK/mOxXkHpgZz/APWoa7iuafl2v/PWisjn+7RRYdyM8+/+f896lhmki+UHcB/CeR/n8cVWJI6U89BVElmS5aRQDg47AYHr+P0qMEkZ7UHq30/pSpSAb8vvRU2BRQM//9k=", 122 | "display_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ff493b6b24e6e2be7df1ec9644d5339c/5CD16638/t51.2885-15/e35/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 123 | "display_resources": [ 124 | { 125 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5150c80ce526c6f6bd4da78e4f57979f/5CBBD552/t51.2885-15/sh0.08/e35/p640x640/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 126 | "config_width": 640, 127 | "config_height": 799 128 | }, 129 | { 130 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/54585546542f3fae7f25ab23d219fd75/5CB87296/t51.2885-15/sh0.08/e35/p750x750/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 131 | "config_width": 750, 132 | "config_height": 937 133 | }, 134 | { 135 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ff493b6b24e6e2be7df1ec9644d5339c/5CD16638/t51.2885-15/e35/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 136 | "config_width": 1080, 137 | "config_height": 1349 138 | } 139 | ], 140 | "accessibility_caption": "Image may contain: dog", 141 | "is_video": false, 142 | "should_log_client_event": false, 143 | "tracking_token": "eyJ2ZXJzaW9uIjo1LCJwYXlsb2FkIjp7ImlzX2FdGGelBj5c190cmFjJa2VkIjpmYWxzZSwidXVpZCI6IjRlODVlYjAyYzdmYjRmMmViNWYwNzg1ODZlZjRhZTEwMTk1MzU2NDE4NDYzNTI2MzAwMCJ9LCJzaWduYXR1cmUiOiIifQ==", 144 | "edge_media_to_tagged_user": { 145 | "edges": [] 146 | }, 147 | "edge_media_to_caption": { 148 | "edges": [ 
149 | { 150 | "node": { 151 | "text": "Cool pic #dogs 👌🏻" 152 | } 153 | } 154 | ] 155 | }, 156 | "caption_is_edited": false, 157 | "has_ranked_comments": false, 158 | "edge_media_to_comment": { 159 | "count": 0, 160 | "page_info": { 161 | "has_next_page": false, 162 | "end_cursor": null 163 | }, 164 | "edges": [] 165 | }, 166 | "comments_disabled": false, 167 | "taken_at_timestamp": 1547103020, 168 | "edge_media_preview_like": { 169 | "count": 3, 170 | "edges": [] 171 | }, 172 | "edge_media_to_sponsor_user": { 173 | "edges": [] 174 | }, 175 | "location": null, 176 | "viewer_has_liked": false, 177 | "viewer_has_saved": false, 178 | "viewer_has_saved_to_collection": false, 179 | "viewer_in_photo_of_you": false, 180 | "viewer_can_reshare": true, 181 | "owner": { 182 | "id": "7050323018", 183 | "is_verified": false, 184 | "profile_pic_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/0859933bacb7ef085efcd513c7336f21/5CCBC50C/t51.2885-19/s150x150/47446882_612896971943840_3814256767933636272_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net", 185 | "username": "user.name", 186 | "blocked_by_viewer": false, 187 | "followed_by_viewer": false, 188 | "full_name": "Full name", 189 | "has_blocked_viewer": false, 190 | "is_private": false, 191 | "is_unpublished": false, 192 | "requested_by_viewer": false 193 | }, 194 | "is_ad": false, 195 | "edge_web_media_to_related_media": { 196 | "edges": [] 197 | } 198 | } 199 | } 200 | ] 201 | 202 | ``` 203 | 204 | ## Media 205 | To download media as well as scrape metadata, include the `-d` flag. By default, Instamancer downloads the highest-quality image available for each post. 206 | 207 | By enabling full mode with `--full`, all images in albums are downloaded as well. 208 | 209 | Videos are downloaded when the `--video` flag is used along with `--full`. 210 | 211 | The default download location for media is `downloads/[endpoint]/[id]`. This can be changed with the `--downdir` flag. 
212 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | See the command-line interface in action [here](https://scriptsmith.github.io/instamancer), and instructions and examples [here](../README.md#command-line) 4 | 5 | |Name|Description| 6 | |---------------------------------------|----------------------------------------------------------| 7 | |[Express server](server.ts) |Express server acting as an API endpoint | 8 | |[Page complexity plugin](complexity.ts)|Plugin that outputs the number of DOM elements on the page| 9 | 10 | Please suggest more examples with a pull request 11 | -------------------------------------------------------------------------------- /examples/complexity.ts: -------------------------------------------------------------------------------- 1 | import * as instamancer from "instamancer"; 2 | import {Response} from "puppeteer"; 3 | /** Example plugin: each time its response hook runs, it counts the page elements matching a CSS selector and writes the count to stdout. */ 4 | class Complexity implements instamancer.IPlugin { 5 | private query: string; 6 | /** @param query CSS selector passed to document.querySelectorAll in the page. */ 7 | constructor(query: string) { 8 | this.query = query; 9 | } 10 | /** Evaluates document.querySelectorAll(query).length inside the page held by this.state and prints "<query> elements: <count>". The res and data arguments are not used. */ 11 | public async responseEvent( 12 | this: instamancer.IPluginContext, PostType>, 13 | res: Response, 14 | data: {[key: string]: any}, 15 | ): Promise { 16 | const elementCount = await this.state.page.evaluate((query) => { 17 | return document.querySelectorAll(query).length; 18 | }, this.plugin.query); 19 | process.stdout.write( 20 | `${this.plugin.query} elements: ${elementCount}\n`, 21 | ); 22 | } 23 | } 24 | /* Build a "user" API for "therock" with one Complexity plugin per selector (div, span, img); grafting is disabled, output is silenced and total is set to 500. */ 25 | const user = instamancer.createApi("user", "therock", { 26 | enableGrafting: false, 27 | plugins: [ 28 | new Complexity("div"), 29 | new Complexity("span"), 30 | new Complexity("img"), 31 | ], 32 | silent: true, 33 | total: 500, 34 | }); 35 | /* Drain the generator into an array so the number of collected posts can be reported afterwards. */ 36 | (async () => { 37 | const posts: instamancer.TPost[] = []; 38 | for await (const post of user.generator()) { 39 | posts.push(post); 40 | } 41 | 42 | 
process.stdout.write(`Total posts ${posts.length}`); 43 | })(); 44 | -------------------------------------------------------------------------------- /examples/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "instamancer-examples", 3 | "version": "1.0.0", 4 | "description": "Examples of instamancer usage", 5 | "author": "ScriptSmith", 6 | "license": "MIT-0", 7 | "dependencies": { 8 | "express": "^4.17.1", 9 | "instamancer": "file:..", 10 | "puppeteer": "^1.20.0" 11 | }, 12 | "devDependencies": { 13 | "@types/puppeteer": "^1.20.2" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/server.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import * as instamancer from "instamancer"; 3 | 4 | const app = express(); 5 | const port = 3000; 6 | 7 | /** Scrape the given hashtag (total: 5) and collect the yielded posts into an array. */ 8 | async function getPosts(tag: string): Promise { 9 | const hashtag = instamancer.createApi("hashtag", tag, { 10 | total: 5, 11 | }); 12 | const posts = []; 13 | 14 | for await (const post of hashtag.generator()) { 15 | posts.push(post); 16 | } 17 | 18 | return posts; 19 | } 20 | 21 | /** Posts served by GET /cached; empty until getCached has completed. */ 22 | let cachedPosts: instamancer.TPost[] = []; 23 | 24 | /** Replace cachedPosts with freshly scraped "puppies" posts. */ 25 | async function getCached() { 26 | cachedPosts = await getPosts("puppies"); 27 | } 28 | // Fill the cache once, three seconds after start-up. 29 | setTimeout(getCached, 3000); 30 | 31 | app.get("/cached", async (req, res) => { 32 | res.json(cachedPosts); 33 | }); 34 | 35 | // "/live" declares no path parameters, so req.params is always empty and a 36 | // `"tag" in req.params` check can never match, which leaves the request 37 | // unanswered. The tag is read from the query string: GET /live?tag=puppies 38 | app.get("/live", async (req, res) => { 39 | const tag = req.query.tag; 40 | if (typeof tag !== "string" || tag === "") { 41 | res.status(400).json({error: "Missing 'tag' query parameter"}); 42 | return; 43 | } 44 | 45 | try { 46 | res.json(await getPosts(tag)); 47 | } catch (error) { 48 | // Report scraping failures instead of leaving the request hanging. 49 | res.status(500).json({error: "Failed to scrape posts"}); 50 | } 51 | }); 52 | 53 | app.listen(port, () => 54 | process.stdout.write(`Example app listening on port ${port}!\n`), 55 | ); 56 | -------------------------------------------------------------------------------- /examples/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* 
Basic Options */ 4 | // "incremental": true, /* Enable incremental compilation */ 5 | "target": "es5" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019' or 'ESNEXT'. */, 6 | "module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */, 7 | // "lib": [], /* Specify library files to be included in the compilation. */ 8 | // "allowJs": true, /* Allow javascript files to be compiled. */ 9 | // "checkJs": true, /* Report errors in .js files. */ 10 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 11 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 12 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 13 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 14 | // "outFile": "./", /* Concatenate and emit output to single file. */ 15 | // "outDir": "./", /* Redirect output structure to the directory. */ 16 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 17 | // "composite": true, /* Enable project compilation */ 18 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 19 | // "removeComments": true, /* Do not emit comments to output. */ 20 | // "noEmit": true, /* Do not emit outputs. */ 21 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 22 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 23 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 24 | 25 | /* Strict Type-Checking Options */ 26 | "strict": true /* Enable all strict type-checking options. 
*/, 27 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 28 | // "strictNullChecks": true, /* Enable strict null checks. */ 29 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 30 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 31 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 32 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 33 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 34 | 35 | /* Additional Checks */ 36 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 37 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 38 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 39 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 40 | 41 | /* Module Resolution Options */ 42 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 43 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 44 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 45 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 46 | // "typeRoots": [], /* List of folders to include type definitions from. */ 47 | // "types": [], /* Type declaration files to be included in compilation. */ 48 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. 
*/ 49 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 50 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 51 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 52 | 53 | /* Source Map Options */ 54 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 55 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 56 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 57 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 58 | 59 | /* Experimental Options */ 60 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 61 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. 
*/ 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /examples/tslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultSeverity": "error", 3 | "extends": ["tslint:recommended"], 4 | "jsRules": {}, 5 | "rules": {}, 6 | "rulesDirectory": [] 7 | } 8 | -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | import {createApi} from "./src/api/api"; 2 | 3 | export { 4 | Hashtag, 5 | Post, 6 | User, 7 | IOptions, 8 | createApi, 9 | IOptionsCommon, 10 | IOptionsFullApi, 11 | IOptionsRegular, 12 | } from "./src/api/api"; 13 | export {Instagram} from "./src/api/instagram"; 14 | export {TSearchResult, ISearchOptions} from "./src/api/search"; 15 | export {TPost, TSinglePost, TFullApiPost} from "./src/api/types"; 16 | 17 | export * from "./plugins"; 18 | -------------------------------------------------------------------------------- /man/instamancer.1: -------------------------------------------------------------------------------- 1 | .\" Manpage for instamancer. 2 | .TH Instamancer 1 3 | .SH NAME 4 | instamancer \- Scrape Instagram's API with Puppeteer 5 | .SH SYNOPSIS 6 | .B instamancer 7 | [\fIoptions\fR] 8 | .IR command 9 | .IR query 10 | .SH DESCRIPTION 11 | Instamancer is an Instagram scraper that uses Puppeteer to control a chromium / chrome browser instance and intercept requests made to APIs. 12 | 13 | Instamancer scrapes hashtags, users, search results, and individual posts. 14 | 15 | Both data and media can be scraped, and then saved to disk or uploaded to external object storage. 16 | 17 | The plugin system can be used to extend instamancer and add other functionality. 
18 | .SH OPTIONS 19 | .TP 20 | .BR \-h ", " \-\-help 21 | Show the list of options and examples 22 | .SH SEE ALSO 23 | The Instamancer project and further documentation can be accessed at https://github.com/ScriptSmith/instamancer 24 | .SH BUGS 25 | Please report bugs at 26 | https://github.com/ScriptSmith/instamancer/issues 27 | .SH AUTHOR 28 | Adam Smith https://github.com/ScriptSmith 29 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "instamancer", 3 | "version": "3.3.1", 4 | "description": "Scrape the Instagram API with Puppeteer", 5 | "main": "index.js", 6 | "types": "index.d.ts", 7 | "bin": { 8 | "instamancer": "src/cli.js" 9 | }, 10 | "man": [ 11 | "./man/instamancer.1" 12 | ], 13 | "files": [ 14 | "index.js", 15 | "index.d.ts", 16 | "src/**/*.js", 17 | "src/**/*.d.ts", 18 | "plugins/*.js", 19 | "plugins/*.d.ts", 20 | "plugins/**/*.js", 21 | "plugins/**/*.d.ts" 22 | ], 23 | "scripts": { 24 | "build": "tsc", 25 | "prepack": "tsc --declaration", 26 | "test": "jest --env=node", 27 | "test:ci": "jest --forceExit --env=node", 28 | "lint": "tslint -p tsconfig.json -p tests/tsconfig.json", 29 | "lint:fix": "npm run lint -- --fix", 30 | "prettier": "prettier --write \"{src,tests}/**/*.ts\"", 31 | "clean": "rimraf src/**/*{.js,.d.ts} src/*{.js,.d.ts} plugins/**/*{.js,.d.ts} plugins/*{.js,.d.ts} tests/**/*{.js,.d.ts} tests/*{.js,.d.ts} examples/*{.js,.d.ts} index{.js,.d.ts} *.log" 32 | }, 33 | "author": "ScriptSmith", 34 | "license": "MIT", 35 | "keywords": [ 36 | "instagram", 37 | "instagram api", 38 | "data mining", 39 | "scraping" 40 | ], 41 | "dependencies": { 42 | "await-lock": "^2.0.1", 43 | "aws-sdk": "^2.715.0", 44 | "axios": "^0.19.2", 45 | "chalk": "^4.1.0", 46 | "env-paths": "^2.2.0", 47 | "fp-ts": "^2.7.0", 48 | "io-ts": "^2.2.9", 49 | "io-ts-excess": "^1.0.1", 50 | "json2csv": "^5.0.1", 51 | "lodash": 
"^4.17.19", 52 | "puppeteer": "^5.2.0", 53 | "tmp": "^0.2.1", 54 | "uuid": "^8.2.0", 55 | "winston": "^3.3.3", 56 | "yargs": "^15.4.1" 57 | }, 58 | "engines": { 59 | "node": ">=10.15.0" 60 | }, 61 | "repository": { 62 | "type": "git", 63 | "url": "git@github.com:ScriptSmith/instamancer.git" 64 | }, 65 | "devDependencies": { 66 | "@types/aws-sdk": "^2.7.0", 67 | "@types/concat-stream": "^1.6.0", 68 | "@types/express": "^4.17.7", 69 | "@types/jest": "^26.0.4", 70 | "@types/json2csv": "^5.0.1", 71 | "@types/node": "^14.0.23", 72 | "@types/tmp": "^0.2.0", 73 | "@types/uuid": "^8.0.0", 74 | "@types/yargs": "^15.0.5", 75 | "express": "^4.17.1", 76 | "husky": "^4.2.5", 77 | "jest": "^26.1.0", 78 | "lint-staged": "^10.2.11", 79 | "prettier": "^2.0.5", 80 | "rimraf": "^3.0.2", 81 | "transform-json-types": "^0.7.0", 82 | "ts-jest": "^26.1.3", 83 | "tslint": "^6.1.2", 84 | "typescript": "^3.9.7" 85 | }, 86 | "jest": { 87 | "coverageDirectory": "./coverage/", 88 | "collectCoverage": true, 89 | "preset": "ts-jest", 90 | "transform": { 91 | "^.+\\.(ts|tsx)$": "ts-jest" 92 | } 93 | }, 94 | "husky": { 95 | "hooks": { 96 | "pre-commit": "lint-staged && npm run lint" 97 | } 98 | }, 99 | "prettier": { 100 | "trailingComma": "all", 101 | "arrowParens": "always", 102 | "bracketSpacing": false, 103 | "tabWidth": 4 104 | }, 105 | "lint-staged": { 106 | "*.json": [ 107 | "prettier --write", 108 | "git add" 109 | ], 110 | "*.ts": [ 111 | "prettier --write", 112 | "tslint --fix", 113 | "git add" 114 | ] 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /plugins/README.md: -------------------------------------------------------------------------------- 1 | # Plugins 2 | 3 | Plugins allow you to modify instamancer's functionality and behavior while gathering data. 
4 | 5 | The following internal plugins are included with instamancer (but not enabled by default): 6 | 7 | |Plugin |Description | 8 | |----------|-------------------------------------------------------------------| 9 | |LargeFirst|Increase the `first` parameter in API requests to ask for more data| 10 | 11 | ## Using plugins with the CLI 12 | 13 | Example: 14 | 15 | ``` 16 | instamancer hashtag puppies -c1000 --plugin LargeFirst --plugin MyPlugin 17 | ``` 18 | 19 | ## Using external plugins with the CLI 20 | 21 | To install external plugins, you need to clone and install instamancer from source 22 | 23 | Steps: 24 | 25 | 1. Clone the instamancer repository 26 | 2. Install instamancer's dependencies 27 | 3. Install the plugin with npm / yarn 28 | 4. Add the plugin to `plugins/plugins/index.ts` 29 | 30 | Example: 31 | 32 | 33 | ``` typescript 34 | export { MyPlugin } from "myplugin"; 35 | ``` 36 | 37 | 5. Install instamancer 38 | 1. You can skip this step if you want to run the CLI from source 39 | 6. 
Run the CLI with the plugin: 40 | 41 | 42 | Example: 43 | 44 | 45 | ``` 46 | instamancer hashtag puppies -c100 --plugin MyPlugin 47 | ``` 48 | 49 | ## Using plugins with the module 50 | 51 | Add the plugin to the `options` : 52 | 53 | ``` typescript 54 | import * as instamancer from "."; 55 | 56 | const options: instamancer.IOptions = { 57 | plugins: [new instamancer.plugins.LargeFirst()], 58 | silent: true, 59 | total: 100, 60 | }; 61 | const hashtag = instamancer.createApi("hashtag", "puppies", options); 62 | 63 | (async () => { 64 | for await (const post of hashtag.generator()) { 65 | console.log(post); 66 | } 67 | })(); 68 | 69 | ``` 70 | -------------------------------------------------------------------------------- /plugins/index.ts: -------------------------------------------------------------------------------- 1 | import * as allPlugins from "./plugins"; 2 | 3 | export const plugins = allPlugins; 4 | export * from "./plugin"; 5 | -------------------------------------------------------------------------------- /plugins/plugin.ts: -------------------------------------------------------------------------------- 1 | import * as puppeteer from "puppeteer"; 2 | import {Instagram, TFullApiPost, TPost, TSearchResult, TSinglePost} from ".."; 3 | 4 | export type DType = TPost | TSinglePost | TFullApiPost | TSearchResult; 5 | 6 | export interface IPluginContext { 7 | plugin: Plugin; 8 | state: Instagram; 9 | } 10 | 11 | export interface IPlugin { 12 | constructionEvent?(this: IPluginContext, PostType>): void; 13 | 14 | requestEvent?( 15 | this: IPluginContext, PostType>, 16 | req: puppeteer.Request, 17 | overrides: puppeteer.Overrides, 18 | ): Promise; 19 | 20 | responseEvent?( 21 | this: IPluginContext, PostType>, 22 | res: puppeteer.Response, 23 | data: {[key: string]: any}, 24 | ): Promise; 25 | 26 | postPageEvent?( 27 | this: IPluginContext, PostType>, 28 | data: PostType, 29 | ): Promise; 30 | 31 | graftingEvent?( 32 | this: IPluginContext, PostType>, 33 | ): 
/**
 * Plugin that asks the Instagram API for more data per request.
 *
 * It raises the scraper's `jumpSize` when the API wrapper is constructed and
 * rewrites the `first` field of the `variables` query parameter on every
 * request handed to `requestEvent`.
 *
 * NOTE(review): the generic parameters are reconstructed from the shape of
 * `IPlugin` / `IPluginContext` in ../plugin — confirm against the upstream
 * declaration.
 */
export class LargeFirst<PostType> implements IPlugin<PostType> {
    /**
     * Runs once when the API wrapper is constructed; sets the wrapper's
     * `jumpSize` to 150.
     */
    public constructionEvent(
        this: IPluginContext<LargeFirst<PostType>, PostType>,
    ): void {
        this.state.jumpSize = 150;
    }

    /**
     * Rewrite the request URL so that `variables.first` is 50.
     *
     * The URL already present in `overrides` wins over the request's own URL,
     * so rewrites made earlier are preserved. Requests whose URL has no
     * `variables` parameter, or whose `variables` is not a JSON object, are
     * left untouched instead of rejecting the returned promise.
     *
     * @param req the intercepted request
     * @param overrides mutable overrides later passed to `req.continue()`
     */
    public async requestEvent(req: Request, overrides: Overrides) {
        const url = overrides["url"] ? overrides["url"] : req.url();
        const parsedUrl = urlParse(url);
        const query = querystring.parse(parsedUrl.query);

        // Nothing to rewrite when the parameter is absent or repeated
        const rawVariables = query["variables"];
        if (typeof rawVariables !== "string") {
            return;
        }

        let parsed: unknown;
        try {
            parsed = JSON.parse(rawVariables);
        } catch (e) {
            // Not JSON: leave the request as it is
            return;
        }
        if (typeof parsed !== "object" || parsed === null) {
            return;
        }

        const variables = parsed as {[key: string]: unknown};
        variables["first"] = 50;

        query.variables = JSON.stringify(variables);
        parsedUrl.search = "?" + querystring.stringify(query);
        overrides["url"] = urlFormat(parsedUrl);
    }
}
/**
 * An Instagram post API wrapper.
 *
 * Visits each of the given post ids in turn and extracts its metadata
 * through `postPage`.
 */
export class Post extends Instagram {
    // Post ids
    private readonly ids: string[];

    /**
     * @param ids shortcodes of the posts to retrieve
     * @param options configuration details; `fullAPI` is always treated as
     * disabled for this class
     */
    constructor(ids: string[], options: IOptions = {}) {
        // fullAPI option makes no sense for Post class
        // But usage with fullAPI option brings an extra post, because of scrapeDefaultPosts
        // So we force it to be disabled.
        // The override is applied to a copy so that an options object the
        // caller reuses for other API instances keeps its own fullAPI value.
        super(
            "https://instagram.com/p/[id]",
            ids[0],
            "",
            "",
            {...options, fullAPI: false},
            SinglePost,
        );
        this.ids = ids;
    }

    /**
     * Get the post metadata: open every post page (up to 5 retries each),
     * sleeping 2 seconds between posts, then mark the API as finished.
     */
    protected async getNext() {
        for (const id of this.ids) {
            this.id = id;
            await this.postPage(id, 5);
            await this.sleep(2);
        }
        this.finished = true;
    }
}
/**
 * Create an API wrapper of the requested type.
 *
 * @param type which wrapper to build: "search", "post", "hashtag" or "user"
 * @param id the search query, the list of post ids, or the hashtag / user name
 * @param options optional configuration, forwarded to the wrapper's
 * constructor unchanged (may be omitted)
 * @returns a new Search, Post, Hashtag or User instance
 * @throws Error when `type` is not one of the supported values
 */
export function createApi(
    type: "search",
    query: string,
    options?: ISearchOptions | ISearchOptionsPlugins,
): Search;
export function createApi(type: "post", id: string[], options?: IOptions): Post;
export function createApi(
    type: "hashtag" | "user",
    id: string,
    options?: IOptionsRegular | IOptionsRegularPlugins,
): InstagramPostClass;
export function createApi(
    type: "hashtag" | "user",
    id: string,
    options?: IOptionsFullApi | IOptionsFullApiPlugins,
): InstagramFullPostClass;

export function createApi(
    type: "hashtag" | "user" | "post" | "search",
    id: string | string[],
    options?: IOptions,
): Post | InstagramPostClass | InstagramFullPostClass | Search {
    // `options` is passed straight through and never dereferenced here, so
    // calling createApi without options no longer throws a TypeError.
    switch (type) {
        case "search":
            return new Search(id as string, options as ISearchOptions);
        case "post":
            return new Post(id as string[], options);
        case "hashtag":
            return new Hashtag(id as string, options);
        case "user":
            return new User(id as string, options);
        default:
            // Only reachable from untyped callers
            throw new Error("Unknown API type: " + String(type));
    }
}
Instagram { 194 | defaultPageFunctions = [ 195 | /* istanbul ignore next */ 196 | () => { 197 | let morePostsIntervalCounter = 0; 198 | const morePostsInterval = setInterval(() => { 199 | const searchDiv = Array.from( 200 | document.getElementsByTagName("div"), 201 | ).filter((d) => 202 | d.innerHTML.startsWith("Show More Posts from"), 203 | )[0]; 204 | 205 | morePostsIntervalCounter++; 206 | 207 | if (searchDiv !== undefined) { 208 | searchDiv.parentElement.parentElement.click(); 209 | clearInterval(morePostsInterval); 210 | } else if (morePostsIntervalCounter > 10) { 211 | clearInterval(morePostsInterval); 212 | } 213 | }, 1000); 214 | }, 215 | ]; 216 | 217 | constructor(id: string, options: IOptions = {}) { 218 | super( 219 | "https://instagram.com/[id]", 220 | id, 221 | "data.user.edge_owner_to_timeline_media.page_info", 222 | "data.user.edge_owner_to_timeline_media.edges", 223 | options, 224 | getPageValidator(options), 225 | ); 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/api/instagram.ts: -------------------------------------------------------------------------------- 1 | import AwaitLock from "await-lock"; 2 | import chalk from "chalk"; 3 | import {isLeft} from "fp-ts/lib/Either"; 4 | import {Type} from "io-ts"; 5 | import {PathReporter} from "io-ts/lib/PathReporter"; 6 | import {ThrowReporter} from "io-ts/lib/ThrowReporter"; 7 | import * as _ from "lodash/object"; 8 | import { 9 | Browser, 10 | Headers, 11 | launch, 12 | LaunchOptions, 13 | Page, 14 | Request, 15 | Response, 16 | } from "puppeteer"; 17 | import * as winston from "winston"; 18 | import { 19 | AsyncPluginEventsType, 20 | IPlugin, 21 | IPluginContext, 22 | PluginEventsType, 23 | SyncPluginEvents, 24 | SyncPluginEventsType, 25 | } from "../../plugins"; 26 | import {IOptions} from "./api"; 27 | import {PostIdSet} from "./postIdSet"; 28 | 29 | type AsyncPluginFunctions = { 30 | [key in AsyncPluginEventsType]: ((...args: any[]) => 
/**
 * Fill in every option the caller left undefined.
 *
 * The passed object is modified in place and also returned. Values that
 * are already set (including falsy ones such as `false` or `0`) are kept.
 */
private static defaultOptions(options: IOptions) {
    // Factories rather than values, so the fallback logger is only created
    // when it is actually needed. Order matches the order of assignment.
    const fallbacks: [string, () => unknown][] = [
        ["enableGrafting", () => true],
        ["sameBrowser", () => false],
        ["fullAPI", () => false],
        ["headless", () => true],
        ["logger", () => winston.createLogger({silent: true})],
        ["silent", () => true],
        ["sleepTime", () => 2],
        ["hibernationTime", () => 60 * 20],
        ["total", () => 0],
    ];

    for (const [name, makeDefault] of fallbacks) {
        if (options[name] === undefined) {
            options[name] = makeDefault();
        }
    }
    return options;
}
Implementation-specific page functions 105 | public defaultPageFunctions: (() => void)[] = []; 106 | 107 | // Validations 108 | private readonly strict: boolean = false; 109 | private readonly validator: Type; 110 | 111 | // Puppeteer state 112 | private browser: Browser; 113 | private browserDisconnected: boolean = true; 114 | private readonly browserInstance?: Browser; 115 | private readonly headless: boolean; 116 | 117 | // Array of scraped posts and lock 118 | private postBuffer: PostType[] = []; 119 | private postBufferLock: AwaitLock = new AwaitLock(); 120 | 121 | // Request and Response buffers and locks 122 | private requestBuffer: Request[] = []; 123 | private requestBufferLock: AwaitLock = new AwaitLock(); 124 | private responseBuffer: Response[] = []; 125 | private responseBufferLock: AwaitLock = new AwaitLock(); 126 | 127 | // Get full amount of data from API 128 | private readonly fullAPI: boolean = false; 129 | private pagePromises: Promise[] = []; 130 | 131 | // Grafting state 132 | private readonly enableGrafting: boolean = true; 133 | private readonly sameBrowser: boolean = false; 134 | private graft: boolean = false; 135 | private graftURL: string = null; 136 | private graftHeaders: Headers = null; 137 | private foundGraft: boolean = false; 138 | 139 | // Hibernation due to rate limiting 140 | private hibernate: boolean = false; 141 | private readonly hibernationTime: number = 60 * 20; // 20 minutes 142 | 143 | // Number of jumps before exiting because lack of data 144 | private failedJumps: number = 20; 145 | private responseFromAPI: boolean = false; 146 | 147 | // Strings denoting the access methods of API objects 148 | private readonly pageQuery: string; 149 | private readonly edgeQuery: string; 150 | 151 | // Cache of post ids 152 | private postIds: PostIdSet; 153 | 154 | // Iteration variables 155 | private readonly total: number; 156 | private index: number = 0; 157 | private jumps: number = 0; 158 | 159 | // Number of times to attempt to 
visit url initially 160 | private readonly maxPageUrlAttempts = 3; 161 | private pageUrlAttempts = 0; 162 | private postPageRetries = 5; 163 | 164 | // Output 165 | private readonly silent: boolean = false; 166 | private writeLock: AwaitLock = new AwaitLock(); 167 | 168 | // Sleep time remaining 169 | private sleepRemaining: number = 0; 170 | 171 | // Length of time to sleep for 172 | private readonly sleepTime: number = 2; 173 | 174 | // Proxy for Instagram connection 175 | private readonly proxyURL: string; 176 | 177 | // Location of chromium / chrome binary executable 178 | private readonly executablePath: string; 179 | 180 | // Plugins to be run 181 | private pluginFunctions: PluginFunctions = { 182 | browser: [], 183 | construction: [], 184 | grafting: [], 185 | postPage: [], 186 | request: [], 187 | response: [], 188 | }; 189 | 190 | /** 191 | * Create API wrapper instance 192 | * @param endpoint the url for the type of resource to scrape 193 | * @param id the identifier for the resource 194 | * @param pageQuery the query to identify future pages in the nested API structure 195 | * @param edgeQuery the query to identify posts in the nested API structure 196 | * @param options configuration details 197 | * @param validator response type validator 198 | */ 199 | constructor( 200 | endpoint: string, 201 | id: string, 202 | pageQuery: string, 203 | edgeQuery: string, 204 | options: IOptions = {}, 205 | validator: Type, 206 | ) { 207 | this.id = id; 208 | this.postIds = new PostIdSet(); 209 | this.url = endpoint.replace("[id]", id); 210 | 211 | options = Instagram.defaultOptions(options); 212 | this.total = options.total; 213 | this.pageQuery = pageQuery; 214 | this.edgeQuery = edgeQuery; 215 | this.browserInstance = options.browserInstance; 216 | this.headless = options.headless; 217 | this.logger = options.logger; 218 | this.silent = options.silent; 219 | this.strict = options.strict; 220 | this.enableGrafting = options.enableGrafting; 221 | this.sameBrowser = 
/**
 * Force the API to stop.
 *
 * Does nothing when the API has not been started, unless `force` is set.
 * Otherwise the run is marked as finished with FORCED_STOP, both buffer
 * locks are released on a best-effort basis (`stop()` needs to acquire
 * them), and the page and browser are closed through `stop()`.
 *
 * @param force stop even if the API was never started
 */
public async forceStop(force?: boolean) {
    const shouldStop = force || this.started;
    if (!shouldStop) {
        return;
    }

    this.started = false;
    this.finish(FinishedReasons.FORCED_STOP);

    // Errors raised by release() are deliberately ignored
    const bufferLocks = [this.requestBufferLock, this.responseBufferLock];
    for (const bufferLock of bufferLocks) {
        try {
            bufferLock.release();
            // tslint:disable-next-line: no-empty
        } catch (e) {}
    }

    await this.stop();
}
303 | public async start() { 304 | let pageConstructed: boolean; 305 | this.pageUrlAttempts = 0; 306 | while (this.pageUrlAttempts++ < this.maxPageUrlAttempts) { 307 | pageConstructed = await this.constructPage(); 308 | if (pageConstructed) { 309 | break; 310 | } 311 | } 312 | if (!pageConstructed) { 313 | await this.forceStop(true); 314 | throw new Error("Failed to visit URL"); 315 | } 316 | 317 | // Build page and visit url 318 | await this.executePlugins("browser"); 319 | 320 | this.started = true; 321 | 322 | // Add event listeners for requests and responses 323 | await this.page.setRequestInterception(true); 324 | this.page.on("request", (req) => this.interceptRequest(req)); 325 | this.page.on("response", (res) => this.interceptResponse(res)); 326 | this.page.on("requestfailed", (res) => this.interceptFailure(res)); 327 | this.page.on("console", (message) => 328 | this.logger.info("Console log", {message}), 329 | ); 330 | 331 | // Ignore dialog boxes 332 | this.page.on("dialog", (dialog) => dialog.dismiss()); 333 | 334 | // Log errors 335 | /* istanbul ignore next */ 336 | this.page.on("error", (error) => 337 | this.logger.error("Console error", {error}), 338 | ); 339 | 340 | // Gather initial posts from web page 341 | if (this.fullAPI) { 342 | await this.scrapeDefaultPosts(); 343 | } 344 | } 345 | 346 | /** 347 | * Match the url to the url used in API requests 348 | */ 349 | public matchURL(url: string) { 350 | return url.startsWith(this.catchURL) && !url.includes("include_reel"); 351 | } 352 | 353 | /** 354 | * Close the page and browser 355 | */ 356 | protected async stop() { 357 | await this.progress(Progress.CLOSING); 358 | 359 | // Remove listeners 360 | if (!this.page.isClosed()) { 361 | this.page.removeAllListeners("request"); 362 | this.page.removeAllListeners("response"); 363 | this.page.removeAllListeners("requestfailed"); 364 | } 365 | 366 | // Clear request buffers 367 | await this.requestBufferLock.acquireAsync(); 368 | this.requestBuffer = []; 
/**
 * Process the requests in the request buffer.
 *
 * Requests that do not match the API url are skipped. Matching requests
 * are continued after the "request" plugins have finished, so overrides
 * written by asynchronous plugins are always applied. While grafting,
 * the first matching request is aborted and its url and headers are
 * recorded; the next matching request is continued with those recorded
 * values, and processing of the buffer stops after either step.
 *
 * The buffer is emptied and its lock released even when a plugin or a
 * request call throws, so a failure cannot leave the lock held.
 */
protected async processRequests() {
    await this.requestBufferLock.acquireAsync();

    let newApiRequest = false;
    try {
        for (const req of this.requestBuffer) {
            // Match url
            if (!this.matchURL(req.url())) {
                continue;
            }
            newApiRequest = true;

            // Not grafting: continue the request once plugins have run
            if (!this.graft) {
                const overrides = {};
                await this.executePlugins("request", req, overrides);
                await req.continue(overrides);
                continue;
            }

            if (this.foundGraft === false) {
                // Gather details
                this.graftURL = req.url();
                this.graftHeaders = req.headers();
                this.foundGraft = true;

                // Cancel request
                await req.abort();
            } else {
                // Swap request
                const overrides = {
                    headers: this.graftHeaders,
                    url: this.graftURL,
                };
                await this.executePlugins("request", req, overrides);
                await req.continue(overrides);

                // Reset grafting data
                this.graft = false;
                this.foundGraft = false;
                this.graftURL = null;
                this.graftHeaders = null;
            }

            // Stop reading requests
            break;
        }
    } finally {
        // Clear buffer and release
        this.requestBuffer = [];
        this.requestBufferLock.release();
    }

    if (this.foundGraft && newApiRequest) {
        // Restart browser and page, clearing all buffers
        await this.stop();
        await this.start();
    }
}
/**
 * Extract the posts from an API response and queue them for output.
 *
 * Posts are read from `data` at the path given by `edgeQuery`. Ids that
 * were already seen are logged and skipped. Every new post increments
 * the scraped index; with fullAPI enabled a `postPage` promise is queued
 * for it, otherwise it is added to the post buffer. Once the requested
 * total has been reached the run is finished and the remaining posts of
 * this response are ignored.
 *
 * @param data the parsed JSON body of an API response
 */
protected async processResponseData(data: unknown) {
    const edges = _.get(data, this.edgeQuery, []);
    for (const edge of edges) {
        const node = edge["node"];
        const postId = node["id"];

        // Skip ids that are already in the cache
        const alreadySeen = this.postIds.add(postId);
        if (alreadySeen) {
            this.logger.info("Duplicate id found", {postId});
            continue;
        }

        // A total of 0 means unlimited
        const withinTotal = this.total === 0 || this.index < this.total;
        if (!withinTotal) {
            this.finish(FinishedReasons.TOTAL_REACHED_API);
            break;
        }

        this.index++;
        if (!this.fullAPI) {
            await this.addToPostBuffer(edge);
            continue;
        }

        const pending = this.postPage(
            node["shortcode"],
            this.postPageRetries,
        );
        this.pagePromises.push(pending);
    }
}
(window["_sharedData"] !== undefined || i++ > 5) { 600 | resolve(); 601 | clearInterval(findSharedData); 602 | } 603 | }, 2000); 604 | }); 605 | 606 | return JSON.stringify( 607 | window["_sharedData"].entry_data.PostPage[0].graphql, 608 | ); 609 | }); 610 | } catch (error) /* istanbul ignore next */ { 611 | await this.handlePostPageError( 612 | postPage, 613 | error, 614 | "Couldn't evaluate on page", 615 | post, 616 | retries, 617 | ); 618 | return; 619 | } 620 | 621 | // Close page 622 | await postPage.close(); 623 | 624 | // Parse data to PostType 625 | try { 626 | parsed = JSON.parse(data) as PostType; 627 | } catch (error) /* istanbul ignore next */ { 628 | await this.handlePostPageError( 629 | postPage, 630 | error, 631 | "Couldn't parse page data", 632 | post, 633 | retries, 634 | ); 635 | return; 636 | } 637 | 638 | await this.executePlugins("postPage", parsed); 639 | await this.addToPostBuffer(parsed); 640 | } 641 | 642 | private async handlePostPageError( 643 | page: Page, 644 | error: Error, 645 | message: string, 646 | post: string, 647 | retries: number, 648 | ) { 649 | // Log error and wait 650 | this.logger.error(message, {error}); 651 | await this.progress(Progress.ABORTED); 652 | await this.sleep(2); 653 | 654 | // Close existing attempt 655 | if (!page.isClosed()) { 656 | await page.close(); 657 | } 658 | 659 | // Retry 660 | if (retries > 0) { 661 | await this.postPage(post, --retries); 662 | } 663 | } 664 | 665 | protected async validatePost(post: PostType) { 666 | const validationResult = this.validator.decode(post); 667 | if (this.strict) { 668 | ThrowReporter.report(validationResult); 669 | return; 670 | } 671 | if (isLeft(validationResult)) { 672 | const validationReporter = PathReporter.report(validationResult); 673 | this.logger.warn( 674 | ` 675 | Warning! The Instagram API has been changed since this version of instamancer was released. 
676 | More info: https://scriptsmith.github.io/instamancer/api-change 677 | `, 678 | {validationReporter, post}, 679 | ); 680 | } 681 | } 682 | 683 | /** 684 | * Stimulate the page until responses gathered 685 | */ 686 | protected async getNext() { 687 | await this.progress(Progress.SCRAPING); 688 | while (true) { 689 | // Process results (if any) 690 | await this.processRequests(); 691 | await this.processResponses(); 692 | 693 | // Finish page promises 694 | if (this.pagePromises.length > 0) { 695 | await this.progress(Progress.BRANCHING); 696 | await Promise.all(this.pagePromises); 697 | this.pagePromises = []; 698 | } 699 | 700 | // Check if finished 701 | if (this.finished) { 702 | break; 703 | } 704 | 705 | // Pause if paused 706 | await this.waitResume(); 707 | 708 | // Interact with page to stimulate request 709 | await this.jump(); 710 | 711 | // Stop if no data is being gathered 712 | if (this.jumps === this.failedJumps) { 713 | if (this.fullAPI) { 714 | if (!this.responseFromAPI) { 715 | this.finish(FinishedReasons.NO_RESPONSE); 716 | } 717 | } else if (this.index === 0) { 718 | this.finish(FinishedReasons.NO_INCREMENT); 719 | 720 | const pageContent = {content: ""}; 721 | try { 722 | pageContent.content = await this.page.content(); 723 | } catch (e) { 724 | // No content 725 | } 726 | 727 | this.logger.error( 728 | "Page failed to make requests", 729 | pageContent, 730 | ); 731 | break; 732 | } 733 | } 734 | 735 | // Enable grafting if required 736 | if (this.jumps % this.jumpMod === 0) { 737 | await this.initiateGraft(); 738 | } 739 | 740 | // Sleep 741 | await this.sleep(this.sleepTime); 742 | 743 | // Hibernate if rate-limited 744 | if (this.hibernate) { 745 | await this.sleep(this.hibernationTime); 746 | this.hibernate = false; 747 | } 748 | 749 | // Break if posts in buffer 750 | await this.postBufferLock.acquireAsync(); 751 | const posts = this.postBuffer.length; 752 | this.postBufferLock.release(); 753 | if (posts > 0) { 754 | break; 755 | } 756 
/**
 * Halt execution, reporting the remaining time once per step.
 *
 * The wait is split into steps of at most one second; before each step
 * `sleepRemaining` is updated and progress is printed. A fractional
 * remainder below one second is waited for in a single shorter step.
 * @param time Seconds
 */
protected async sleep(time: number) {
    let remaining = time;
    while (remaining > 0) {
        this.sleepRemaining = remaining;
        await this.progress(Progress.SCRAPING);

        const stepMs = Math.min(remaining, 1) * 1000;
        await new Promise((wake) => {
            setTimeout(wake, stepMs);
        });

        remaining--;
    }

    this.sleepRemaining = 0;
    await this.progress(Progress.SCRAPING);
}
/**
 * Handle errors that occur during page construction.
 *
 * Logs the error together with the target url, reports the ABORTED
 * state, waits for `timeout` seconds and then closes the failed page.
 * The browser is closed under the same conditions as in `stop()`: only
 * when it is still connected and was not supplied through the
 * `browserInstance` option, because a caller-supplied browser is the
 * caller's to close.
 *
 * @param error description of what went wrong
 * @param timeout seconds to wait before cleaning up
 */
private async handleConstructionError(error: string, timeout: number) {
    // Log error and wait
    this.logger.error("Construction error", {error, url: this.url});
    await this.progress(Progress.ABORTED);
    await this.sleep(timeout);

    // Close existing attempt
    if (!this.page.isClosed()) {
        await this.page.close();
    }
    if (!this.browserDisconnected && !this.browserInstance) {
        await this.browser.close();
    }
}
postBuffer (using locks). Returns null if no posts in buffer 908 | */ 909 | private async postPop() { 910 | let post = null; 911 | await this.postBufferLock.acquireAsync(); 912 | if (this.postBuffer.length > 0) { 913 | post = this.postBuffer.shift(); 914 | } 915 | this.postBufferLock.release(); 916 | return post; 917 | } 918 | 919 | /** 920 | * Print progress to stderr 921 | */ 922 | private async progress(state: Progress) { 923 | // End if silent 924 | if (this.silent) { 925 | return; 926 | } 927 | 928 | // Lock 929 | await this.writeLock.acquireAsync(); 930 | 931 | // Calculate total 932 | const total = this.total === 0 ? "Unlimited" : this.total; 933 | 934 | // Generate output string 935 | const idStr = chalk.bgYellow.black(` ${this.id} `); 936 | const totalStr = chalk.bgBlack(` Total: ${total} `); 937 | const stateStr = chalk.bgWhite.black(` State: ${state} `); 938 | const sleepStr = chalk.bgWhite.black( 939 | ` Sleeping: ${this.sleepRemaining} `, 940 | ); 941 | const indexStr = chalk.bgWhite.black(` Scraped: ${this.index} `); 942 | 943 | this.logger.debug({ 944 | id: this.id, 945 | index: this.index, 946 | sleepRemaining: this.sleepRemaining, 947 | state, 948 | total, 949 | }); 950 | 951 | // Print output 952 | process.stderr.write( 953 | `\r${idStr}${totalStr}${stateStr}${sleepStr}${indexStr}\u001B[K`, 954 | ); 955 | 956 | // Release 957 | this.writeLock.release(); 958 | } 959 | 960 | /** 961 | * Add request to the request buffer 962 | */ 963 | private async interceptRequest(req: Request) { 964 | await this.requestBufferLock.acquireAsync(); 965 | this.requestBuffer.push(req); 966 | await this.requestBufferLock.release(); 967 | } 968 | 969 | /** 970 | * Add the response to the response buffer 971 | */ 972 | private async interceptResponse(res: Response) { 973 | await this.responseBufferLock.acquireAsync(); 974 | this.responseBuffer.push(res); 975 | await this.responseBufferLock.release(); 976 | } 977 | 978 | /** 979 | * Log failed requests 980 | */ 981 | 
private async interceptFailure(req: Request) { 982 | this.logger.info("Failed request", {url: req.url()}); 983 | await this.progress(Progress.ABORTED); 984 | } 985 | 986 | /** 987 | * Add post to buffer 988 | */ 989 | private async addToPostBuffer(post: PostType) { 990 | await this.postBufferLock.acquireAsync(); 991 | await this.validatePost(post); 992 | this.postBuffer.push(post); 993 | this.postBufferLock.release(); 994 | } 995 | 996 | /** 997 | * Manipulate the page to stimulate a request 998 | */ 999 | private async jump() { 1000 | await this.page.keyboard.press("PageUp"); 1001 | const jumpSize = this.graft ? 1 : this.jumpSize; 1002 | for (let i = 0; i < jumpSize; i++) { 1003 | await this.page.keyboard.press("End"); 1004 | } 1005 | 1006 | // Move mouse randomly 1007 | const width = this.page.viewport()["width"]; 1008 | const height = this.page.viewport()["height"]; 1009 | await this.page.mouse.move( 1010 | Math.round(width * Math.random()), 1011 | Math.round(height * Math.random()), 1012 | ); 1013 | 1014 | ++this.jumps; 1015 | } 1016 | 1017 | /** 1018 | * Clear request and response buffers 1019 | */ 1020 | private async initiateGraft() { 1021 | // Check if enabled 1022 | if (!this.enableGrafting) { 1023 | return; 1024 | } 1025 | 1026 | await this.progress(Progress.GRAFTING); 1027 | 1028 | this.executePlugins("grafting"); 1029 | 1030 | // Enable grafting 1031 | this.graft = true; 1032 | } 1033 | 1034 | /** 1035 | * Read the posts that are pre-loaded on the page 1036 | */ 1037 | private async scrapeDefaultPosts() { 1038 | // Get shortcodes from page 1039 | /* istanbul ignore next */ 1040 | const shortCodes = await this.page.evaluate((url) => { 1041 | return Array.from(document.links) 1042 | .filter((link) => { 1043 | return ( 1044 | link.href.startsWith(url) && 1045 | link.href.split("/").length >= 2 1046 | ); 1047 | }) 1048 | .map((link) => { 1049 | const linkSplit = link.href.split("/"); 1050 | return linkSplit[linkSplit.length - 2]; 1051 | }); 1052 | }, 
this.defaultPostURL); 1053 | 1054 | // Add postPage promises 1055 | for (const shortCode of shortCodes) { 1056 | if (this.index < this.total || this.total === 0) { 1057 | this.index++; 1058 | this.pagePromises.push( 1059 | this.postPage(shortCode, this.postPageRetries), 1060 | ); 1061 | } else { 1062 | this.finish(FinishedReasons.TOTAL_REACHED_PAGE); 1063 | break; 1064 | } 1065 | } 1066 | } 1067 | 1068 | private addPlugins(plugins: IPlugin[]) { 1069 | if (!plugins) { 1070 | return; 1071 | } 1072 | 1073 | for (const plugin of plugins) { 1074 | for (const event of Object.keys(this.pluginFunctions)) { 1075 | const pluginEvent = plugin[event + "Event"]; 1076 | if (pluginEvent) { 1077 | const context: IPluginContext = { 1078 | plugin, 1079 | state: this, 1080 | }; 1081 | 1082 | this.pluginFunctions[event].push(pluginEvent.bind(context)); 1083 | } 1084 | } 1085 | } 1086 | } 1087 | 1088 | private executePlugins(event: SyncPluginEventsType, ...args): void; 1089 | private executePlugins( 1090 | event: AsyncPluginEventsType, 1091 | ...args 1092 | ): Promise; 1093 | private executePlugins(event: PluginEventsType, ...args) { 1094 | if (event in SyncPluginEvents) { 1095 | for (const pluginFunction of this.pluginFunctions["construction"]) { 1096 | pluginFunction(); 1097 | } 1098 | return; 1099 | } 1100 | 1101 | return Promise.all( 1102 | // @ts-ignore 1103 | this.pluginFunctions[event].map((cb) => cb(...args)), 1104 | ); 1105 | } 1106 | } 1107 | 1108 | /** 1109 | * The states of progress that the API can be in. Used to output status. 
/**
 * The states of progress that the API can be in. Used to output status.
 *
 * Each member's string value is the text shown after "State:" in the status
 * line written by `progress()`.
 */
enum Progress {
    LAUNCHING = "Launching",
    OPENING = "Navigating",
    SCRAPING = "Scraping",
    BRANCHING = "Branching",
    GRAFTING = "Grafting",
    CLOSING = "Closing",

    PAUSED = "Paused",
    ABORTED = "Request aborted",
}

/**
 * Reasons why the collection finished
 *
 * This is a numeric enum: members take the values 0..5 in declaration order,
 * so reordering or inserting members changes their values.
 */
enum FinishedReasons {
    // forceStop used
    FORCED_STOP,

    // API response doesn't contain next page
    API_FINISHED,

    // Total posts required have been collected from the API
    TOTAL_REACHED_API,

    // Total posts required have been collected from the default posts
    TOTAL_REACHED_PAGE,

    // No API response intercepted after interacting with page
    NO_RESPONSE,

    // Index hasn't increased after interacting with page
    NO_INCREMENT,
}
/**
 * A set of post ids used to detect duplicates
 */
export class PostIdSet {
    // Ids seen so far; the element type is string because add() takes a string
    private readonly ids: Set<string> = new Set<string>();

    /**
     * Add a post id to the set.
     * @param id the post id to record
     * @return true if the id was already in the set, false if not.
     */
    public add(id: string): boolean {
        if (this.ids.has(id)) {
            return true;
        }
        this.ids.add(id);
        return false;
    }
}
/**
 * Codec for one user entry of a search result.
 */
export const Users = t.type({
    position: t.number,
    user: excess(
        t.type({
            full_name: t.string,
            account_badges: t.array(t.undefined),
            biography_product_mentions: t.array(t.undefined),
            has_anonymous_profile_picture: t.boolean,
            is_private: t.boolean,
            is_verified: t.boolean,
            latest_reel_media: t.number,
            mutual_followers_count: t.number,
            pk: t.string,
            profile_pic_id: t.union([t.string, t.undefined]),
            profile_pic_url: t.string,
            username: t.string,
        }),
    ),
});

/**
 * Codec for one place entry of a search result.
 */
export const Places = t.type({
    place: excess(
        t.type({
            header_media: t.any,
            location: excess(
                t.type({
                    address: t.string,
                    city: t.string,
                    external_source: t.string,
                    facebook_places_id: t.number,
                    lat: t.union([t.undefined, t.number]),
                    lng: t.union([t.undefined, t.number]),
                    name: t.string,
                    pk: t.string,
                    short_name: t.string,
                }),
            ),
            media_bundles: t.UnknownArray,
            slug: t.string,
            subtitle: t.string,
            title: t.string,
        }),
    ),
    position: t.number,
});

/**
 * Codec for one hashtag entry of a search result.
 */
export const Hashtags = t.type({
    hashtag: excess(
        t.type({
            id: t.string,
            media_count: t.number,
            name: t.string,
            profile_pic_url: t.string,
            search_result_subtitle: t.string,
            use_default_avatar: t.boolean,
        }),
    ),
    position: t.number,
});

/**
 * Codec for the complete search response.
 */
export const SearchResult = t.type({
    clear_client_cache: t.boolean,
    has_more: t.boolean,
    hashtags: t.array(Hashtags),
    places: t.array(Places),
    rank_token: t.string,
    status: t.string,
    users: t.array(Users),
});

/** Static type described by the `SearchResult` codec. */
export type TSearchResult = t.TypeOf<typeof SearchResult>;

/**
 * `IOptions` without the options that are excluded for a search:
 * "total", "fullAPI", "hibernationTime" and "sleepTime".
 */
export type ISearchOptions = Pick<
    IOptions,
    Exclude<
        keyof IOptions,
        "total" | "fullAPI" | "hibernationTime" | "sleepTime"
    >
>;

/** Search options extended with an optional list of plugins. */
export interface ISearchOptionsPlugins extends ISearchOptions {
    plugins?: IPlugin[];
}

/**
 * Performs a search by typing the query into a text input on a hashtag page
 * and capturing the response whose URL starts with `catchURL`.
 */
export class Search extends Instagram {
    public readonly catchURL = "https://www.instagram.com/web/";
    private searchResult: TSearchResult;
    private readonly searchQuery: string;
    private readonly inputElementQuery: string = "input[type='text']";

    /**
     * @param query the text to search for
     * @param options options forwarded to the base class
     */
    constructor(query: string, options: ISearchOptions = {}) {
        super(
            "https://instagram.com/explore/tags/instagram",
            "",
            "",
            "",
            options,
            SearchResult,
        );
        this.searchQuery = query;
    }

    /**
     * Run the search and return the captured result.
     *
     * Starts the scraper if needed, types the query into the search input,
     * waits for the matching request and response, processes both, stops the
     * scraper and returns whatever `processResponseData` stored.
     */
    public async get() {
        if (!this.started) {
            await this.start();
        }
        try {
            await this.page.waitForSelector(this.inputElementQuery, {
                timeout: 30000,
            });
        } catch {
            // Timeout: deliberately ignored, the click below is still
            // attempted and fails if the input never appeared
        }
        await this.page.click(this.inputElementQuery);

        await this.page.keyboard.sendCharacter(this.searchQuery);
        await this.page.waitForRequest((req) => this.matchURL(req.url()));
        await this.processRequests();
        await this.page.waitForResponse((res) => this.matchURL(res.url()));
        await this.processResponses();
        await this.stop();
        return this.searchResult;
    }

    /**
     * @return true when `url` starts with `catchURL`
     */
    public matchURL(url: string) {
        return url.startsWith(this.catchURL);
    }

    /**
     * Validate the response data and keep it as the search result.
     */
    protected async processResponseData(data: TSearchResult) {
        await this.validatePost(data);
        this.searchResult = data;
    }
}
// tslint:disable: object-literal-sort-keys
import * as t from "io-ts";
import {excess} from "io-ts-excess";

/** Codec for a post's location. */
export const Location = t.type({
    id: t.string,
    has_public_page: t.boolean,
    name: t.string,
    slug: t.string,
    address_json: t.union([t.string, t.undefined, t.null]),
});

/** Codec for the owner of a `PostNode` (id only). */
export const PostNodeOwner = t.type({
    id: t.string,
});

/** Codec for the owner of a comment. */
export const CommentNodeOwner = t.type({
    id: t.string,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
});

/** Codec for the owner of a `ShortcodeMedia`. */
export const ShortcodeMediaOwner = t.type({
    id: t.string,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
    blocked_by_viewer: t.boolean,
    followed_by_viewer: t.boolean,
    full_name: t.string,
    has_blocked_viewer: t.boolean,
    is_private: t.boolean,
    is_unpublished: t.boolean,
    requested_by_viewer: t.boolean,
});

/** Codec for pagination information. */
export const PageInfo = t.type({
    has_next_page: t.boolean,
    end_cursor: t.union([t.string, t.null]),
});

/** Codec for a height / width pair. */
export const Dimensions = t.type({
    height: t.number,
    width: t.number,
});

/** Codec for an object that only carries a `count`. */
export const Counter = t.type({
    count: t.number,
});

/** Codec for gating information. */
export const GatingInfo = t.type({
    buttons: t.array(t.string),
    description: t.string,
    gating_type: t.string,
    title: t.string,
});

/** Codec for a list of display resources (source URL and configured size). */
export const DisplayResources = t.array(
    t.type({
        src: t.string,
        config_width: t.number,
        config_height: t.number,
    }),
);

/** Codec for a node of `EdgeMediaToCaption`; every field may be undefined. */
export const EdgeMediaToCaptionNode = t.type({
    text: t.union([t.string, t.undefined]),
    shortcode: t.union([t.string, t.undefined]),
    is_video: t.union([t.boolean, t.undefined]),
    video_url: t.union([t.string, t.undefined]),
    display_resources: t.union([DisplayResources, t.undefined]),
});

/** Codec for a list of edges whose nodes are `EdgeMediaToCaptionNode`s. */
export const EdgeMediaToCaption = t.type({
    edges: t.array(
        t.type({
            node: EdgeMediaToCaptionNode,
        }),
    ),
});

/** Codec for a related profile, including a preview of its timeline media. */
export const RelatedProfile = t.type({
    id: t.string,
    full_name: t.string,
    is_private: t.boolean,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
    edge_followed_by: t.type({
        count: t.number,
    }),
    edge_owner_to_timeline_media: t.type({
        count: t.number,
        edges: t.array(
            t.type({
                node: t.type({
                    __typename: t.string,
                    id: t.string,
                    shortcode: t.string,
                    edge_media_preview_like: Counter,
                    edge_media_preview_comment: Counter,
                    thumbnail_src: t.string,
                    owner: t.type({
                        id: t.string,
                        username: t.string,
                    }),
                    gating_info: t.union([GatingInfo, t.null, t.undefined]),
                    is_video: t.boolean,
                    accessibility_caption: t.union([t.string, t.null]),
                }),
            }),
        ),
    }),
});

/** Codec for the list of related profile edges. */
export const EdgeRelatedProfiles = t.type({
    edges: t.array(
        t.type({
            node: t.union([t.undefined, RelatedProfile]),
        }),
    ),
});

/** Codec for the children of a sidecar (multi-item) post. Not exported. */
const EdgeSidecarToChildren = t.type({
    edges: t.array(
        t.type({
            node: t.type({
                __typename: t.string,
                id: t.string,
                shortcode: t.union([t.string, t.undefined]),
                dimensions: Dimensions,
                gating_info: t.union([t.null, t.undefined]),
                fact_check_information: t.union([t.null, t.undefined]),
                media_preview: t.union([t.undefined, t.string, t.null]),
                display_url: t.string,
                display_resources: DisplayResources,
                accessibility_caption: t.union([t.string, t.undefined, t.null]),
                is_video: t.boolean,
                video_url: t.union([t.string, t.undefined]),
                tracking_token: t.string,
                edge_media_to_tagged_user: EdgeMediaToCaption,
            }),
        }),
    ),
});

/** Codec for the `node` of a `Post`. */
export const PostNode = t.type({
    __typename: t.union([t.string, t.undefined]),
    comments_disabled: t.boolean,
    location: t.union([t.null, t.undefined, Location]),
    id: t.string,
    edge_media_to_caption: EdgeMediaToCaption,
    shortcode: t.string,
    edge_media_to_comment: Counter,
    taken_at_timestamp: t.number,
    sensitivity_friction_info: t.union([GatingInfo, t.null, t.undefined]),
    media_overlay_info: t.union([t.null, t.undefined]),
    fact_check_information: t.union([t.null, t.undefined]),
    fact_check_overall_rating: t.union([t.undefined, t.null]),
    dimensions: Dimensions,
    display_url: t.string,
    edge_liked_by: t.union([Counter, t.undefined]),
    edge_media_preview_like: Counter,
    owner: PostNodeOwner,
    thumbnail_src: t.string,
    thumbnail_resources: t.union([DisplayResources, t.undefined]),
    is_video: t.boolean,
    accessibility_caption: t.union([t.string, t.undefined, t.null]),
    display_resources: t.union([DisplayResources, t.undefined]),
    should_log_client_event: t.union([t.undefined, t.boolean]),
    tracking_token: t.union([t.undefined, t.string]),
    edge_media_to_tagged_user: t.union([t.undefined, EdgeMediaToCaption]),
    edge_media_to_sponsor_user: t.union([t.undefined, EdgeMediaToCaption]),
    dash_info: t.union([
        t.undefined,
        t.type({
            is_dash_eligible: t.boolean,
            video_dash_manifest: t.null,
            number_of_qualities: t.number,
        }),
    ]),
    video_url: t.union([t.undefined, t.string]),
    video_view_count: t.union([t.undefined, t.number]),
    gating_info: t.union([t.null, t.undefined]),
    media_preview: t.union([t.undefined, t.string, t.null]),
    product_type: t.union([t.undefined, t.string]),
    viewer_has_liked: t.union([t.undefined, t.boolean]),
    viewer_has_saved: t.union([t.boolean, t.undefined]),
    viewer_has_saved_to_collection: t.union([t.boolean, t.undefined]),
    viewer_in_photo_of_you: t.union([t.boolean, t.undefined]),
    viewer_can_reshare: t.union([t.boolean, t.undefined]),
    edge_sidecar_to_children: t.union([EdgeSidecarToChildren, t.undefined]),
});

/** Codec for a single comment. */
export const CommentNode = t.type({
    id: t.string,
    text: t.string,
    created_at: t.number,
    did_report_as_spam: t.boolean,
    owner: CommentNodeOwner,
    viewer_has_liked: t.boolean,
    edge_liked_by: Counter,
});

/** Codec for a counted list of comment edges. */
export const EdgeMediaPreviewComment = t.type({
    count: t.number,
    edges: t.array(
        t.type({
            node: CommentNode,
        }),
    ),
});

/** Codec for a list of comment edges without a count. */
export const EdgeMediaHoistedComment = t.type({
    edges: t.array(
        t.type({
            node: CommentNode,
        }),
    ),
});

/** Codec for a comment together with its threaded replies. Not exported. */
const EdgeMediaToParentCommentNode = t.intersection([
    CommentNode,
    t.type({
        edge_threaded_comments: t.type({
            count: t.number,
            page_info: PageInfo,
            edges: t.array(
                t.type({
                    node: CommentNode,
                }),
            ),
        }),
    }),
]);

/** Codec for a post: a `PostNode` wrapped in `excess` under `node`. */
export const Post = t.type({
    node: excess(PostNode),
});

/** Codec for a paginated, counted list of parent comments. */
export const EdgeMediaToParentComment = t.type({
    count: t.number,
    page_info: PageInfo,
    edges: t.array(
        t.type({
            node: EdgeMediaToParentCommentNode,
        }),
    ),
});

/** Codec for the `shortcode_media` object of a single post. */
export const ShortcodeMedia = t.type({
    __typename: t.string,
    id: t.string,
    shortcode: t.string,
    edge_media_to_comment: t.union([Counter, t.undefined]),
    thumbnail_src: t.union([t.undefined, t.string]),
    dimensions: Dimensions,
    gating_info: t.union([GatingInfo, t.null, t.undefined]),
    sensitivity_friction_info: t.union([GatingInfo, t.null, t.undefined]),
    fact_check_information: t.null,
    fact_check_overall_rating: t.union([t.undefined, t.null]),
    media_overlay_info: t.null,
    media_preview: t.union([t.string, t.null]),
    display_url: t.string,
    display_resources: DisplayResources,
    accessibility_caption: t.union([t.string, t.undefined, t.null]),
    is_video: t.boolean,
    should_log_client_event: t.union([t.boolean, t.undefined]),
    tracking_token: t.string,
    edge_media_to_tagged_user: EdgeMediaToCaption,
    edge_media_to_caption: EdgeMediaToCaption,
    caption_is_edited: t.boolean,
    has_ranked_comments: t.boolean,
    has_audio: t.union([t.boolean, t.undefined]),
    edge_media_to_parent_comment: t.union([
        EdgeMediaToParentComment,
        t.undefined,
    ]),
    edge_media_to_hoisted_comment: t.union([
        EdgeMediaHoistedComment,
        t.undefined,
    ]),
    edge_media_preview_comment: t.union([EdgeMediaPreviewComment, t.undefined]),
    edge_related_profiles: EdgeRelatedProfiles,
    comments_disabled: t.boolean,
    commenting_disabled_for_viewer: t.boolean,
    clips_music_attribution_info: t.union([t.null, t.undefined]),
    taken_at_timestamp: t.number,
    edge_media_preview_like: EdgeMediaPreviewComment,
    edge_media_to_sponsor_user: EdgeMediaToCaption,
    location: t.union([t.string, t.null]),
    viewer_has_liked: t.boolean,
    viewer_has_saved: t.boolean,
    viewer_has_saved_to_collection: t.boolean,
    viewer_in_photo_of_you: t.boolean,
    viewer_can_reshare: t.boolean,
    owner: ShortcodeMediaOwner,
    is_ad: t.boolean,
    edge_web_media_to_related_media: EdgeMediaToCaption,
    edge_sidecar_to_children: t.union([EdgeSidecarToChildren, t.undefined]),
    dash_info: t.union([
        t.undefined,
        t.type({
            is_dash_eligible: t.boolean,
            video_dash_manifest: t.null,
            number_of_qualities: t.number,
        }),
    ]),
    video_url: t.union([t.undefined, t.string]),
    video_view_count: t.union([t.undefined, t.number]),
    video_play_count: t.union([t.undefined, t.null, t.number]),
    encoding_status: t.union([t.undefined, t.string, t.null]),
    is_published: t.union([t.undefined, t.boolean]),
    product_type: t.union([t.undefined, t.string]),
    title: t.union([t.undefined, t.string, t.null]),
    video_duration: t.union([t.undefined, t.number]),
});

/** Codec for a single post: `ShortcodeMedia` wrapped in `excess`. */
export const SinglePost = t.type({
    shortcode_media: excess(ShortcodeMedia),
});

/**
 * Codec for a full API post: the `ShortcodeMedia` fields, with `location`
 * overridden to be a `Location` object or null instead of a string or null.
 */
export const FullApiPost = t.type({
    shortcode_media: excess(
        t.type({
            ...ShortcodeMedia.props,
            location: t.union([Location, t.null]),
        }),
    ),
});

// tslint:enable: object-literal-sort-keys

/** Static type described by the `Post` codec. */
export type TPost = t.TypeOf<typeof Post>;

/** Static type described by the `SinglePost` codec. */
export type TSinglePost = t.TypeOf<typeof SinglePost>;

/** Static type described by the `FullApiPost` codec. */
export type TFullApiPost = t.TypeOf<typeof FullApiPost>;
/**
 * Create the winston logger described by the "logging" and "logfile"
 * arguments. With logging set to "none" the logger is silent and has no
 * transports; otherwise it writes to the log file at the requested level.
 */
const getLogger = (args) => {
    const level = args["logging"];
    const silent = level === "none";

    // A file transport is only attached when logging is enabled
    const transports = silent
        ? []
        : [
              new winston.transports.File({
                  filename: args["logfile"],
                  level,
                  silent,
              }),
          ];

    return winston.createLogger({level, silent, transports});
};

/**
 * Translate parsed command line arguments into API options.
 *
 * @param args parsed arguments
 * @param logger logger handed to the API
 * @throws Error when a requested plugin name is not exported by the
 *         plugins module
 */
function getOptions(args, logger) {
    // Instantiate every requested plugin, failing on the first unknown name
    const requestedPlugins = args["plugin"].map((pluginName) => {
        const pluginClass = plugins.plugins[pluginName];
        if (!pluginClass) {
            throw new Error("Couldn't find plugin " + pluginName);
        }
        return new pluginClass();
    });

    const options: IOptions = {
        enableGrafting: args["graft"],
        executablePath: args["browser"],
        fullAPI: args["full"],
        headless: !args["visible"],
        logger,
        plugins: requestedPlugins,
        sameBrowser: args["sameBrowser"],
        silent: args["quiet"],
        sleepTime: args["sleep"],
        strict: args["strict"],
        total: args["count"],
    };
    return options;
}
/**
 * Build argument parser
 *
 * Defines the commands and options, parses `args` and runs the matching
 * command handler. The scraping and search handlers invoke `callback` once
 * they have finished; the batch handler exits the process itself after the
 * last command of the batch file has completed.
 *
 * @param args the arguments to parse (an argv array, or one line of a batch
 *             file)
 * @param callback invoked when the selected command has finished
 */
function buildParser(args, callback) {
    /* tslint:disable:no-unused-expression */
    require("yargs")(args)
        .usage("Usage: $0 <command> [options]")
        .command("hashtag [id]", "Scrape a hashtag", {}, async (handleArgs) => {
            await spawn(handleArgs);
            callback();
        })
        .command(
            "user [id]",
            "Scrape a users posts",
            {},
            async (handleArgs) => {
                await spawn(handleArgs);
                callback();
            },
        )
        .command(
            "post [ids]",
            "Scrape a comma-separated list of posts",
            {},
            async (handleArgs) => {
                await spawn(handleArgs);
                callback();
            },
        )
        .command(
            "search [query]",
            "Perform a search of users, tags and places",
            {},
            async (handleArgs) => {
                const logger = getLogger(handleArgs);
                const options = getOptions(handleArgs, logger);
                if (!handleArgs["query"]) {
                    throw new Error("query required");
                }
                const search = createApi(
                    "search",
                    handleArgs["query"],
                    options,
                );
                const result = await search.get();
                process.stdout.write("\n");
                process.stdout.write(JSON.stringify(result, null, 2));
                process.stdout.write("\n");
                callback();
            },
        )
        .command(
            "batch [batchfile]",
            "Read newline-separated arguments from a file",
            {},
            (handleArgs) => {
                // Prefer the parsed positional: the raw args[1] is only the
                // batch file when no option precedes it on the command line.
                // The raw lookup is kept as a fallback.
                const batchFile = handleArgs["batchfile"] || args[1];
                if (!batchFile) {
                    throw new Error("batchfile required");
                }

                // A list of functions which create new Promises that are
                // resolved by buildParser when the spawn commands are
                // finished
                // See https://stackoverflow.com/a/45951080/7435520
                const functions = [];

                // Read the list of commands from file
                readline
                    .createInterface({
                        crlfDelay: Infinity,
                        input: fs.createReadStream(batchFile),
                    })
                    .on(
                        "line",
                        // For each line, create a new function which
                        // creates a new promise to be resolved by
                        // buildParser. Empty lines and lines starting
                        // with "#" are skipped.
                        (line) => {
                            if (line.length > 0 && line.charAt(0) !== "#") {
                                functions.push(
                                    () =>
                                        new Promise((res) =>
                                            buildParser(line, res),
                                        ),
                                );
                            }
                        },
                    )
                    .on(
                        "close",
                        // When all lines have been read, execute the
                        // commands one after another by waiting for
                        // each promise to be resolved
                        async () => {
                            for (const f of functions) {
                                await f();
                            }
                            process.exit();
                        },
                    );
            },
        )
        /* tslint:disable:object-literal-sort-keys */
        .options({
            count: {
                alias: "c",
                number: true,
                default: 0,
                describe: "Number of posts to download (0 for all)",
                group: "Configuration",
            },
            full: {
                alias: ["f"],
                boolean: true,
                default: false,
                describe: "Retrieve full post data",
                group: "Configuration",
            },
            sleep: {
                alias: ["s"],
                number: true,
                default: 2,
                describe: "Seconds to sleep between interactions",
                group: "Configuration",
            },
            graft: {
                alias: "g",
                boolean: true,
                default: true,
                describe: "Enable grafting",
                group: "Configuration",
            },
            browser: {
                alias: ["b"],
                string: true,
                default: undefined,
                describe: "Browser path. Defaults to the puppeteer version",
                group: "Configuration",
            },
            sameBrowser: {
                boolean: true,
                default: false,
                describe: "Use a single browser when grafting",
                group: "Configuration",
            },
            download: {
                alias: "d",
                boolean: true,
                default: false,
                describe: "Save images from posts",
                group: "Download",
            },
            downdir: {
                default: "downloads/[endpoint]/[id]",
                describe: "Download path",
                group: "Download",
            },
            video: {
                alias: "v",
                boolean: true,
                default: false,
                describe: "Download videos (requires full)",
                implies: "full",
                group: "Download",
            },
            sync: {
                boolean: true,
                default: false,
                describe: "Force download between requests",
                group: "Download",
            },
            threads: {
                alias: "k",
                number: true,
                default: 4,
                describe: "Parallel download / depot threads",
                group: "Download",
            },
            waitDownload: {
                alias: "w",
                boolean: true,
                default: false,
                describe: "Download media after scraping",
                group: "Download",
            },
            bucket: {
                string: true,
                default: undefined,
                describe: "Upload files to an AWS S3 bucket",
                group: "Upload",
            },
            depot: {
                string: true,
                default: undefined,
                describe: "Upload files to a URL with a PUT request (depot)",
                group: "Upload",
            },
            file: {
                alias: ["o"],
                string: true,
                default: "[id]",
                describe: "Output filename. '-' for stdout",
                group: "Output",
            },
            type: {
                alias: ["t"],
                default: "json",
                describe: "Filetype",
                choices: ["csv", "json", "both"],
                group: "Output",
            },
            mediaPath: {
                alias: ["m"],
                boolean: true,
                default: false,
                describe: "Add filepaths to _mediaPath",
                group: "Output",
            },
            visible: {
                boolean: true,
                default: false,
                describe: "Show browser on the screen",
                group: "Display",
            },
            quiet: {
                alias: ["q"],
                boolean: true,
                default: false,
                describe: "Disable progress output",
                group: "Display",
            },
            logging: {
                alias: ["l"],
                default: "none",
                choices: ["none", "error", "info", "debug"],
                group: "Logging",
            },
            logfile: {
                string: true,
                default: "instamancer.log",
                describe: "Log file name",
                group: "Logging",
            },
            strict: {
                boolean: true,
                default: false,
                describe: "Throw an error on response type mismatch",
                group: "Validation",
            },
            plugin: {
                alias: ["p"],
                array: true,
                default: [],
                describe: "Use a plugin from the plugins directory",
                group: "Plugins",
            },
        })
        .demandCommand()
        .example(
            "$0 hashtag instagood -fvd",
            "Download all the available posts, and their media from #instagood",
        )
        .example(
            "$0 user arianagrande --type=csv --logging=info --visible",
            "Download Ariana Grande's posts to a CSV file with a non-headless browser, and log all events",
        )
        .epilog(
            "Source code available at https://github.com/ScriptSmith/instamancer",
        )
        .strict().argv;
    /* tslint:enable:no-unused-expression */
}
/**
 * Spawn an instance of the API
 *
 * Scrapes the endpoint named by the command, optionally downloads or
 * uploads the media of every post, and finally writes the collected posts
 * to a file unless they were streamed to stdout.
 *
 * @param args parsed command line arguments
 * @throws Error when neither an id nor a list of ids was supplied
 */
async function spawn(args) {
    // Initiate logger
    const logger = getLogger(args);

    // Check id
    if (!(args["id"] || args["ids"])) {
        throw new Error("Id required");
    }

    // Pick endpoint (the command name is the first positional argument)
    const endpoint = args["_"][0];
    let ids;
    if (endpoint === "post") {
        ids = args["ids"].split(",");
        args["id"] = ids.length === 1 ? ids[0] : "posts";
        args["full"] = true;
    } else {
        ids = args["id"];
    }

    // Define options
    const options: IOptions = getOptions(args, logger);

    // Replace downdir
    const downdir = args["downdir"]
        .replace("[id]", args["id"])
        .replace("[endpoint]", endpoint);

    // Replace depot url
    let depotUrl = args["depot"];
    if (depotUrl && depotUrl.includes("[uuid]")) {
        depotUrl = depotUrl.replace("[uuid]", uuid());
        if (!args["quiet"]) {
            process.stdout.write(depotUrl + "\n");
        }
    }

    // Get s3 bucket
    const s3Bucket = args["bucket"];

    // Check if outputting to stdout
    const printOutput = args["file"] === "-";

    // Connect to object storage
    let downloadUpload;
    let toCSVFunc = toCSV;
    let toJSONFunc = toJSON;
    if (depotUrl) {
        // Depot
        const depotConfig = {
            directory: downdir,
            url: depotUrl,
            logger,
        };

        downloadUpload = depotUpload.depot.bind(depotConfig);
        toCSVFunc = depotUpload.toCSV.bind(depotConfig);
        toJSONFunc = depotUpload.toJSON.bind(depotConfig);
    } else if (s3Bucket) {
        // s3
        const s3Config = {
            bucket: s3Bucket,
            directory: downdir,
            s3: new aws.S3(),
            logger,
        };

        downloadUpload = s3Upload.s3.bind(s3Config);
        toCSVFunc = s3Upload.toCSV.bind(s3Config);
        toJSONFunc = s3Upload.toJSON.bind(s3Config);
    } else {
        // Download
        downloadUpload = download.bind({
            directory: downdir,
            logger,
        });
    }

    // Start API
    logger.info("Starting API at " + Date.now());
    const obj = createApi(endpoint, ids, options);
    await obj.start();

    // Start download pool
    const getPool = new GetPool(args["threads"], downloadUpload);

    // Pick between synchronous and parallel downloads
    const downloadFunction = args["sync"]
        ? downloadUpload
        : getPool.add.bind(getPool);

    // Add pause callback
    function handleKeypress(str, key) {
        if (key.name === "space") {
            obj.pause();
        } else if (key.name === "c" && key.ctrl) {
            process.stdout.write("\n");
            process.kill(process.pid, "SIGINT");
        }
    }

    process.stdin.on("keypress", handleKeypress);

    // Choose the url and file type of one media node: the video when the
    // node is a video and videos were requested, otherwise the last display
    // resource. The resource list is only read, never popped, so the post
    // that is printed or saved keeps all of its display resources.
    const pickMedia = (node): [string, FILETYPES] => {
        if (node.is_video && args["video"]) {
            return [node.video_url, FILETYPES.VIDEO];
        }
        const resources = node.display_resources;
        return [resources[resources.length - 1].src, FILETYPES.IMAGE];
    };

    // Array of urls and filenames
    let downloadMedia: [string, string, FILETYPES][] = [];

    // Download posts
    const posts = [];
    for await (const post of obj.generator()) {
        // Add _mediaPath key
        if (args["mediaPath"]) {
            post["_mediaPath"] = [];
        }

        // Identify download urls
        if (args["download"] && ("node" in post || "shortcode_media" in post)) {
            // Check the scraping level
            if (args["full"]) {
                // Check if album
                const postObject = post as TFullApiPost;
                const media = postObject.shortcode_media;
                const children = media.edge_sidecar_to_children;

                // An album contributes one entry per child, a plain post
                // contributes itself
                const nodes =
                    children !== undefined
                        ? children.edges.map((child) => child.node)
                        : [media];
                for (const node of nodes) {
                    const [mediaUrl, mediaType] = pickMedia(node);
                    saveMediaMetadata(
                        post,
                        args,
                        downloadMedia,
                        downdir,
                        mediaUrl,
                        node.shortcode,
                        mediaType,
                    );
                }
            } else {
                const postObject = post as TPost;
                saveMediaMetadata(
                    post,
                    args,
                    downloadMedia,
                    downdir,
                    postObject.node.thumbnail_src,
                    postObject.node.shortcode,
                    FILETYPES.IMAGE,
                );
            }
        }

        // Output if required
        if (printOutput) {
            process.stdout.write(JSON.stringify(post, null, 2) + "\n");
        } else {
            posts.push(post);
        }

        // Download the identified media
        if (!args["waitDownload"]) {
            for (const asset of downloadMedia) {
                await downloadFunction(...asset);
            }
            downloadMedia = [];
        }
    }

    // Download remaining media
    for (const asset of downloadMedia) {
        await downloadFunction(...asset);
    }

    // Close download pool
    await new Promise((resolve) => {
        getPool.close(resolve);
    });
    await Promise.all(getPool.promises);

    // Replace filename
    const filename = args["file"]
        .replace("[id]", args["id"])
        .replace("[endpoint]", endpoint);

    // Save file. An extension is only appended when both types are written
    // or when the default "[id]" file name is in use.
    if (!printOutput) {
        const appendExtension =
            args["type"] === "both" || args["file"] === "[id]";
        if (args["type"] !== "json") {
            await toCSVFunc(posts, appendExtension ? filename + ".csv" : filename);
        }
        if (args["type"] !== "csv") {
            await toJSONFunc(
                posts,
                appendExtension ? filename + ".json" : filename,
            );
        }
    }

    // Remove pause callback
    process.stdin.removeAllListeners("keypress");

    // Close logger
    logger.close();
}

/**
 * Queue one media file for download and, when "mediaPath" is set, record
 * the path it will be stored at on the post's `_mediaPath` list.
 *
 * @param post the post the media belongs to (mutated when "mediaPath" is set)
 * @param args parsed command line arguments
 * @param downloadMedia queue of [url, shortcode, fileType] entries (appended to)
 * @param downDir directory used to build the recorded path
 * @param url location of the media
 * @param shortcode used as the file name
 * @param fileType used as the file extension
 */
function saveMediaMetadata(
    post: object,
    args: object,
    downloadMedia: [string, string, FILETYPES][],
    downDir: string,
    url: string,
    shortcode: string,
    fileType: FILETYPES,
) {
    if (args["mediaPath"]) {
        let uri = path.join(downDir, shortcode + "." + fileType);
        // NOTE(review): no "swift" option is declared by the argument
        // parser in this file - confirm whether this branch is still needed
        uri = args["swift"] ? "swift://" + uri : uri;
        post["_mediaPath"].push(uri);
    }
    downloadMedia.push([url, shortcode, fileType]);
}

/**
 * File extensions of the media that can be downloaded.
 * Declared before the parser is started so that the enum is initialised by
 * the time any command handler runs.
 */
enum FILETYPES {
    VIDEO = "mp4",
    IMAGE = "jpg",
}

// Catch key presses
readline.emitKeypressEvents(process.stdin);
if ("setRawMode" in process.stdin) {
    process.stdin.setRawMode(true);
}

// Parse args
buildParser(process.argv.slice(2), () => {
    process.exit(0);
});
/**
 * Signature of the transfer function the pool runs for every queued asset.
 */
type DownloadUpload = (
    url: string,
    name: string,
    extension: string,
) => Promise<void>;

/**
 * One pending transfer: remembers its arguments and records when the
 * transfer function has settled.
 */
class GetJob {
    // True once the transfer has settled, whether it succeeded or failed
    public finished: boolean = false;
    private readonly url: string;
    private readonly name: string;
    private readonly extension: string;
    private readonly downloadUpload: DownloadUpload;

    constructor(
        url: string,
        name: string,
        extension: string,
        downloadUpload: DownloadUpload,
    ) {
        this.url = url;
        this.name = name;
        this.extension = extension;
        this.downloadUpload = downloadUpload;
    }

    /**
     * Run the transfer. Rejects when the transfer function rejects.
     */
    public async start(): Promise<void> {
        try {
            await this.downloadUpload(this.url, this.name, this.extension);
        } finally {
            // Mark the job as done even on failure; otherwise the pool keeps
            // the slot occupied forever and close() never resolves.
            this.finished = true;
        }
    }
}

/**
 * A pool of jobs that only executes k jobs 'simultaneously'
 */
export class GetPool {
    // Job promises
    public promises: Array<Promise<void>> = [];

    // Jobs that are currently being executed
    private runningJobs: GetJob[] = [];

    // Jobs that are yet to be executed
    private queuedJobs: GetJob[] = [];

    // Maximum number of jobs to be executed simultaneously
    private readonly maxConnections: number;

    // Looping interval executing promises
    private readonly loop: ReturnType<typeof setInterval>;

    // Lock loop function execution
    private lock: boolean = false;

    // End-of-input signal triggered externally by close()
    private finished: boolean = false;

    // End-of-input resolve function
    private resolve: (() => void) | undefined;

    // Download / Upload function
    private readonly downloadUpload: DownloadUpload;

    /**
     * @param connections Maximum number of jobs running at the same time
     * @param downloadUpload Function that transfers a single asset
     */
    constructor(connections: number = 1, downloadUpload: DownloadUpload) {
        this.maxConnections = connections;
        this.loop = setInterval(() => {
            this.poolLoop();
        }, 100);
        this.downloadUpload = downloadUpload;
    }

    /**
     * Queue an asset; it starts on a later tick of the pool loop.
     */
    public add(url: string, name: string, extension: string) {
        this.queuedJobs.push(
            new GetJob(url, name, extension, this.downloadUpload),
        );
    }

    /**
     * Signal that no more jobs will be added.
     * @param resolve Called once the queue is drained and nothing is running
     */
    public close(resolve: () => void) {
        this.finished = true;
        this.resolve = resolve;
    }

    private poolLoop() {
        // Obtain lock or cancel
        if (this.lock) {
            return;
        }
        this.lock = true;

        try {
            // Remove finished jobs only. The previous splice(i) call had no
            // delete count and therefore also dropped every job after the
            // first finished one, including jobs that were still running.
            this.runningJobs = this.runningJobs.filter((job) => !job.finished);

            // Add new jobs to empty running slots
            while (this.runningJobs.length < this.maxConnections) {
                const job = this.queuedJobs.shift();
                if (job === undefined) {
                    break;
                }
                const promise = job.start();
                // The stored promise still rejects for whoever awaits
                // `promises`; this handler only stops the runtime from
                // reporting it as unhandled before that happens.
                promise.catch(() => undefined);
                this.promises.push(promise);
                this.runningJobs.push(job);
            }

            // End the interval when end-of-input signal given
            if (
                this.finished &&
                this.queuedJobs.length === 0 &&
                this.runningJobs.length === 0
            ) {
                clearInterval(this.loop);
                if (this.resolve) {
                    this.resolve();
                }
            }
        } finally {
            // Release lock even if a callback threw
            this.lock = false;
        }
    }
}
/**
 * Split the credentials out of a URL so they can be passed to axios as
 * basic-auth options instead of being embedded in the request URL.
 *
 * `URL.username` and `URL.password` are percent-encoded, so they are decoded
 * before being returned; otherwise credentials containing reserved
 * characters (e.g. "@" or ":") would be sent in their encoded form.
 *
 * @param url Absolute URL, optionally containing `user:password@`
 * @returns The URL without credentials, plus the decoded credentials
 */
function authURL(
    url: string,
): {url: string; auth: {username: string; password: string}} {
    const components = new URL(url);
    const auth = {
        password: decodeCredential(components.password),
        username: decodeCredential(components.username),
    };
    components.username = "";
    components.password = "";

    return {
        auth,
        url: components.toString(),
    };
}

/**
 * Percent-decode a credential, keeping the raw value when it contains a
 * malformed escape sequence that cannot be decoded.
 */
function decodeCredential(value: string): string {
    try {
        return decodeURIComponent(value);
    } catch (e) {
        return value;
    }
}
/**
 * Download file
 *
 * Failures are logged and swallowed so that one broken asset does not abort
 * the whole run.
 *
 * @param url The URL of the file
 * @param name The name used to identify the file
 * @param extension The file extension (eg. "jpg" or "mp4")
 */
export async function download(
    this: IDownload,
    url: string,
    name: string,
    extension: string,
) {
    // Best effort: if the directory cannot be created, opening the write
    // stream below fails and is reported through the catch block.
    await new Promise<void>((resolve) => {
        fs.mkdir(this.directory, {recursive: true}, () => resolve());
    });
    try {
        // Get data
        const response = await axios({
            method: "get",
            responseType: "stream",
            url,
        });

        // Write to file
        await new Promise<void>((resolve, reject) => {
            const stream = fs.createWriteStream(
                this.directory + "/" + name + "." + extension,
            );
            stream.on("finish", () => resolve());
            // Without these handlers a failed read or write would leave the
            // promise pending forever and stall the download pool.
            stream.on("error", reject);
            response.data.on("error", (err) => {
                // pipe() does not close the destination when the source fails
                stream.destroy();
                reject(err);
            });
            // noinspection TypeScriptValidateJSTypes
            response.data.pipe(stream);
        });
    } catch (e) {
        this.logger.info(`Downloading ${url} failed`);
        this.logger.debug(e);
    }
}
/**
 * Serialise posts to a temporary file and upload that file to the bucket.
 *
 * Upload errors are logged rather than thrown. The temporary file is always
 * removed, including when serialisation fails.
 *
 * @param posts The posts to serialise
 * @param filePath Object key to upload to
 * @param fileFunc Writes the posts to the given local path
 * @param contentType Content type stored with the uploaded object
 */
async function uploadFile(
    this: IUpload,
    posts: object[],
    filePath: string,
    fileFunc: (posts: object[], filePath: string) => Promise<void>,
    contentType: string,
) {
    // Create tmp file; keep: true means it is not cleaned up automatically
    const tmpFile = tmp.fileSync({keep: true});

    try {
        // Dump posts to file
        await fileFunc(posts, tmpFile.name);

        // Read file to a stream
        const fileStream = fs.createReadStream(tmpFile.name);
        const contentLength = fs.statSync(tmpFile.name).size;

        // s3 upload
        await new Promise<void>((resolve) => {
            this.s3.upload(
                {
                    Body: fileStream,
                    Bucket: this.bucket,
                    ContentLength: contentLength,
                    ContentType: contentType,
                    Key: filePath,
                },
                (err) => {
                    // Truthiness check: a missing error may be null or undefined
                    if (err) {
                        this.logger.error(`Uploading ${filePath} failed`, err);
                    }
                    resolve();
                },
            );
        });
    } finally {
        // Delete file even when serialisation or the upload setup threw
        fs.unlinkSync(tmpFile.name);
    }
}
/**
 * Instagram scraper pointed at the local fixture server instead of the real
 * site. Progress output is always silenced, and the scraper is force-stopped
 * after 30 seconds.
 */
// NOTE(review): the type argument was lost in extraction and is presumed to be
// the static type of FakeValidator - confirm against the repository.
export class FakePage extends Instagram<t.TypeOf<typeof FakeValidator>> {
    constructor(options: IFakePageOptions = {path: "", port: 0}) {
        // Address of the fixture server, with the optional sub-path appended
        const origin = "http://127.0.0.1:" + options.port;
        const baseURL = options.path ? origin + options.path : origin;

        // Caller-supplied API options, with silent mode forced on
        const silentOptions: IOptions = {silent: true};
        const apiOptions = {
            ...options.options,
            ...silentOptions,
        };

        super(
            baseURL,
            "",
            options.pageQuery,
            options.edgeQuery,
            apiOptions,
            FakeValidator,
        );

        this.catchURL = `${baseURL}/${options.catchPage}`;
        this.postURL = `${baseURL}/${options.postPage}`;

        // Force the scraper to stop once 30 seconds have passed
        const stopLater = async () => {
            await this.forceStop();
        };
        setTimeout(stopLater, 30000);
    }
}
2; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /tests/server.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import {AddressInfo} from "net"; 3 | 4 | const app = express(); 5 | 6 | app.get("/", (req, res) => { 7 | res.send(` 8 | 9 | 27 | `); 28 | }); 29 | 30 | app.get("/rate_limit", (req, res) => { 31 | res.send( 32 | JSON.stringify({ 33 | status: "fail", 34 | }), 35 | ); 36 | }); 37 | 38 | app.get("/invalid_json", (req, res) => { 39 | res.send("invalid"); 40 | }); 41 | 42 | app.get("/non_object", (req, res) => { 43 | res.send("1"); 44 | }); 45 | 46 | app.get("/no_next_page", (req, res) => { 47 | res.send( 48 | JSON.stringify({ 49 | data: { 50 | end_cursor: "cursor", 51 | has_next_page: false, 52 | }, 53 | }), 54 | ); 55 | }); 56 | 57 | app.get("/duplicate_ids", (req, res) => { 58 | res.send( 59 | JSON.stringify({ 60 | data: { 61 | edges: [ 62 | { 63 | node: { 64 | id: "1", 65 | }, 66 | }, 67 | { 68 | node: { 69 | id: "1", 70 | }, 71 | }, 72 | ], 73 | end_cursor: "cursor", 74 | has_next_page: true, 75 | }, 76 | }), 77 | ); 78 | }); 79 | 80 | app.get("/invalid_id", (req, res) => { 81 | res.send( 82 | JSON.stringify({ 83 | data: { 84 | edges: [ 85 | { 86 | node: { 87 | id: "badid", 88 | }, 89 | }, 90 | ], 91 | end_cursor: "cursor", 92 | has_next_page: false, 93 | }, 94 | }), 95 | ); 96 | }); 97 | 98 | app.get("/invalid_page", (req, res) => { 99 | res.send("

Sorry, this page isn't available.

"); 100 | }); 101 | 102 | let listener; 103 | 104 | export async function startServer(): Promise { 105 | await new Promise((resolve) => { 106 | listener = app.listen(0, resolve); 107 | }); 108 | 109 | return (listener.address() as AddressInfo).port; 110 | } 111 | 112 | export async function stopServer() { 113 | await new Promise((resolve) => { 114 | listener.close(resolve); 115 | }); 116 | } 117 | -------------------------------------------------------------------------------- /tests/test.spec.ts: -------------------------------------------------------------------------------- 1 | import * as t from "io-ts"; 2 | import {launch, Overrides, Request} from "puppeteer"; 3 | import * as winston from "winston"; 4 | import {createApi, IPlugin} from ".."; 5 | import {plugins} from ".."; 6 | import {IPluginContext} from "../plugins"; 7 | import {IOptions, IOptionsFullApi} from "../src/api/api"; 8 | import {FakePage, IFakePageOptions} from "./__fixtures__/FakePage"; 9 | import {QuickGraft} from "./__fixtures__/QuickGraft"; 10 | import {startServer, stopServer} from "./server"; 11 | 12 | jest.setTimeout(8 * 60 * 1000); 13 | /* tslint:disable:no-console */ 14 | 15 | const hashtags = ["beach", "gym", "puppies", "party", "throwback"]; 16 | const users = ["snoopdogg", "arianagrande", "bbc", "whitehouse", "australia"]; 17 | const posts = [ 18 | "By54GDoHGzK", 19 | "Be3rTNplCHf", 20 | "BlBvw2_jBKp", 21 | "Bzi33wDnxOz", 22 | "BfzEfy-lK1N", 23 | "Bneu_dCHVdn", 24 | "Brx-adXA9C1", 25 | "Bz5flRagYQt", 26 | "BmRZH7NFwi6", 27 | "BpiIJCUnYwy", 28 | ]; 29 | 30 | let smallSize = 10; 31 | let mediumSize = 100; 32 | let largeSize = 1000; 33 | 34 | // Run faster unless executing in CI 35 | if (!process.env.CI) { 36 | smallSize /= 10; 37 | mediumSize /= 10; 38 | largeSize /= 10; 39 | } 40 | 41 | const browserPath = process.env.CHROME 42 | ? 
/**
 * Registers a Jest test that, when running in CI, announces its name on the
 * console and in the log file before executing. Used to debug stalled builds
 * in travis.
 * @param name Test name
 * @param callback Test function
 */
function testWrapper(name: string, callback: () => Promise<void>) {
    const announce = (): void => {
        if (!process.env.CI) {
            return;
        }
        const logSignPost = `JEST: Testing ${name}`;
        console.log(logSignPost);
        testWrapperLogger.info(logSignPost);
    };

    test(name, async () => {
        announce();
        await callback();
    });
}
/**
 * Describes what to exercise for one endpoint in the API limit tests: the
 * endpoint name, the ids to scrape and the result-set sizes to request.
 */
class ApiTestConditions {
    constructor(
        public readonly api: "hashtag" | "user",
        public readonly ids: string[],
        public readonly sizes: number[],
    ) {}
}
255 | ["Sleep", {sleepTime: 5, total}], 256 | ["Headless", {headless: false, total}], 257 | ["Grafting", {enableGrafting: false, total}], 258 | ["Executable path", {executablePath: browserPath, total}], 259 | ["Full api", {fullAPI: true, total}], 260 | ["Limited full api", {fullAPI: true, total: 5}], 261 | ]; 262 | 263 | for (const [index, [name, options]] of optionsCollection.entries()) { 264 | testWrapper(name, async () => { 265 | // @ts-ignore 266 | const tag = createApi("hashtag", hashtagId, options); 267 | const scraped = []; 268 | 269 | for await (const post of tag.generator()) { 270 | expect(post).toBeDefined(); 271 | scraped.push(post); 272 | } 273 | 274 | if (index === 0) { 275 | expect(scraped.length).toBeGreaterThan(total); 276 | } else if (index === optionsCollection.length - 1) { 277 | expect(scraped.length).toBe(5); 278 | } else { 279 | expect(scraped.length).toBe(total); 280 | } 281 | }); 282 | } 283 | }); 284 | 285 | describe("Unusual behavior", () => { 286 | testWrapper("Empty page", async () => { 287 | const user = createApi("user", emptyAccountName, {}).generator(); 288 | const userPosts = []; 289 | for await (const post of user) { 290 | userPosts.push(post); 291 | } 292 | expect(userPosts.length).toBe(0); 293 | }); 294 | 295 | testWrapper("No grafting", async () => { 296 | const total = 100; 297 | const hashtag = hashtags[0]; 298 | const api = new QuickGraft(hashtag, {total, enableGrafting: false}); 299 | const scraped = []; 300 | 301 | for await (const post of api.generator()) { 302 | scraped.push(post); 303 | } 304 | 305 | expect(scraped.length).toBe(total); 306 | }); 307 | 308 | testWrapper("Pausing", async () => { 309 | const api = createApi("hashtag", hashtags[0], {total: 100}); 310 | const iterator = api.generator(); 311 | 312 | api.pause(); 313 | setTimeout(() => { 314 | api.pause(); 315 | }, 20000); 316 | 317 | for await (const post of iterator) { 318 | expect(post).toBeDefined(); 319 | } 320 | }); 321 | 322 | testWrapper("Hibernation", 
async () => { 323 | const options: IOptions = { 324 | hibernationTime: 10, 325 | total: smallSize, 326 | }; 327 | 328 | const api = createApi("hashtag", hashtags[0], options); 329 | const iterator = api.generator(); 330 | 331 | await iterator.next(); 332 | api.toggleHibernation(); 333 | 334 | for await (const post of iterator) { 335 | expect(post).toBeDefined(); 336 | } 337 | }); 338 | 339 | testWrapper("Failed Page visit", async () => { 340 | const options: IOptions = { 341 | proxyURL: "127.0.0.1:9999", 342 | }; 343 | const api = createApi("hashtag", hashtags[0], options); 344 | const scraped = []; 345 | 346 | try { 347 | for await (const post of api.generator()) { 348 | scraped.push(post); 349 | } 350 | } catch (e) { 351 | expect(e).toBeDefined(); 352 | } 353 | 354 | expect(scraped.length).toBe(0); 355 | }); 356 | }); 357 | 358 | describe("Network and API issues", () => { 359 | async function testOptions(options: IFakePageOptions) { 360 | options.port = await startServer(); 361 | const api = new FakePage(options); 362 | const mock = jest.fn(); 363 | 364 | try { 365 | for await (const post of api.generator()) { 366 | mock(post); 367 | } 368 | } catch (e) { 369 | expect(e).toBeDefined(); 370 | } 371 | await api.forceStop(); 372 | 373 | await stopServer(); 374 | } 375 | 376 | testWrapper("Rate limit", async () => { 377 | await testOptions({ 378 | catchPage: "rate_limit", 379 | options: {hibernationTime: 10}, 380 | }); 381 | }); 382 | 383 | testWrapper("Invalid JSON", async () => { 384 | await testOptions({catchPage: "invalid_json"}); 385 | }); 386 | 387 | testWrapper("Non object", async () => { 388 | await testOptions({catchPage: "non_object"}); 389 | }); 390 | 391 | testWrapper("No next page", async () => { 392 | await testOptions({catchPage: "no_next_page", pageQuery: "data"}); 393 | }); 394 | 395 | testWrapper("Duplicate post ids", async () => { 396 | await testOptions({ 397 | catchPage: "duplicate_ids", 398 | edgeQuery: "data.edges", 399 | pageQuery: "data", 400 
| }); 401 | }); 402 | 403 | testWrapper("Invalid post id", async () => { 404 | await testOptions({ 405 | catchPage: "invalid_id", 406 | edgeQuery: "data.edges", 407 | options: {fullAPI: true, total: 1}, 408 | pageQuery: "data", 409 | }); 410 | }); 411 | 412 | testWrapper("Invalid page", async () => { 413 | await testOptions({ 414 | path: "/invalid_page", 415 | }); 416 | }); 417 | }); 418 | 419 | describe("Strict mode", () => { 420 | const failingValidator = t.type({ 421 | foo: t.string, 422 | }); 423 | 424 | testWrapper( 425 | "Should fire warning if strict is false and validations are different", 426 | async () => { 427 | const logger = createLogger(); 428 | logger.warn = jest.fn(); 429 | const iterator = createApi("hashtag", hashtags[0], { 430 | logger, 431 | strict: false, 432 | total: 1, 433 | validator: failingValidator, 434 | }).generator(); 435 | 436 | let i = 0; 437 | for await (const post of iterator) { 438 | i++; 439 | expect(logger.warn).toBeCalledTimes(i); 440 | } 441 | }, 442 | ); 443 | 444 | testWrapper( 445 | "Should not fire warning if strict is false and validations are ok", 446 | async () => { 447 | const logger = createLogger(); 448 | logger.warn = jest.fn(); 449 | const iterator = createApi("hashtag", hashtags[0], { 450 | logger, 451 | strict: false, 452 | total: 1, 453 | }).generator(); 454 | 455 | for await (const post of iterator) { 456 | expect(logger.warn).toBeCalledTimes(0); 457 | } 458 | }, 459 | ); 460 | 461 | testWrapper( 462 | "Should throw validation error if strict is true and types are incorrect", 463 | async () => { 464 | expect.hasAssertions(); 465 | const iterator = createApi("hashtag", hashtags[0], { 466 | strict: true, 467 | total: 1, 468 | validator: failingValidator, 469 | }).generator(); 470 | 471 | try { 472 | await iterator.next(); 473 | } catch (e) { 474 | expect(e).toBeInstanceOf(Error); 475 | expect(e.message).toMatch(/^Invalid value/); 476 | } 477 | }, 478 | ); 479 | 480 | testWrapper( 481 | "Should throw validation 
error if strict is true and types are incorrect (Post)", 482 | async () => { 483 | expect.hasAssertions(); 484 | const iterator = createApi("post", posts, { 485 | strict: true, 486 | total: 1, 487 | validator: failingValidator, 488 | }).generator(); 489 | 490 | try { 491 | await iterator.next(); 492 | } catch (e) { 493 | expect(e).toBeInstanceOf(Error); 494 | expect(e.message).toMatch(/^Invalid value/); 495 | } 496 | }, 497 | ); 498 | 499 | testWrapper( 500 | "Should throw validation error if strict is true and types are incorrect (Full Mode)", 501 | async () => { 502 | expect.hasAssertions(); 503 | const iterator = createApi("hashtag", hashtags[0], { 504 | fullAPI: true, 505 | strict: true, 506 | total: 1, 507 | validator: failingValidator, 508 | }).generator(); 509 | 510 | try { 511 | await iterator.next(); 512 | } catch (e) { 513 | expect(e).toBeInstanceOf(Error); 514 | expect(e.message).toMatch(/^Invalid value/); 515 | } 516 | }, 517 | ); 518 | }); 519 | 520 | describe("Search", () => { 521 | testWrapper("Search Result Users", async () => { 522 | const result = await createApi( 523 | "search", 524 | "therock", 525 | libraryTestOptions, 526 | ).get(); 527 | expect(result.users.length).toBeGreaterThan(0); 528 | const user = result.users[0].user; 529 | expect(user.username).toBe("therock"); 530 | expect(user.full_name).toBeTruthy(); 531 | expect(user.profile_pic_url).toBeTruthy(); 532 | }); 533 | 534 | testWrapper("Search Result Hashtags", async () => { 535 | const result = await createApi( 536 | "search", 537 | "nofilter", 538 | libraryTestOptions, 539 | ).get(); 540 | expect(result.hashtags.length).toBeGreaterThan(0); 541 | const hashtag = result.hashtags[0].hashtag; 542 | expect(hashtag.media_count).not.toBeUndefined(); 543 | expect(hashtag.name).toBe("nofilter"); 544 | }); 545 | 546 | testWrapper("Search Result Places", async () => { 547 | const result = await createApi( 548 | "search", 549 | "New york", 550 | libraryTestOptions, 551 | ).get(); 552 | 
expect(result.places.length).toBeGreaterThan(0); 553 | const place = result.places[0].place; 554 | expect(place.title).toMatch(/New York/); 555 | }); 556 | 557 | testWrapper("Incorrect validation", async () => { 558 | const failingValidator = t.type({ 559 | foo: t.string, 560 | }); 561 | 562 | expect.hasAssertions(); 563 | const search = createApi("search", "Doesn't matter", { 564 | strict: true, 565 | validator: failingValidator, 566 | }); 567 | 568 | try { 569 | await search.get(); 570 | } catch (e) { 571 | expect(e).toBeInstanceOf(Error); 572 | expect(e.message).toMatch(/^Invalid value/); 573 | } 574 | await search.forceStop(); 575 | }); 576 | 577 | testWrapper("Search should fire only one network request", async () => { 578 | const searchRequestsSpy = jest.fn(); 579 | 580 | class RequestCounter implements IPlugin { 581 | public async requestEvent( 582 | this: IPluginContext, PostType>, 583 | req: Request, 584 | overrides: Overrides, 585 | ) { 586 | if (this.state.matchURL(req.url())) { 587 | searchRequestsSpy(); 588 | } 589 | } 590 | } 591 | 592 | const search = createApi( 593 | "search", 594 | "A really long long long string to find something in Instagram", 595 | { 596 | plugins: [new RequestCounter()], 597 | }, 598 | ); 599 | 600 | await search.get(); 601 | expect(searchRequestsSpy).toBeCalledTimes(1); 602 | }); 603 | }); 604 | 605 | describe("Plugins", () => { 606 | testWrapper("Internal plugins", async () => { 607 | for (const plugin in plugins) { 608 | if (!plugins.hasOwnProperty(plugin)) { 609 | continue; 610 | } 611 | 612 | const options: IOptions = { 613 | plugins: [new plugins[plugin]()], 614 | silent: true, 615 | total: 100, 616 | }; 617 | const hashtag = createApi("hashtag", hashtags[0], options); 618 | 619 | const mock = jest.fn(); 620 | for await (const post of hashtag.generator()) { 621 | mock(post); 622 | } 623 | expect(mock).toBeCalledTimes(100); 624 | } 625 | }); 626 | }); 627 | 628 | describe("Browser instance passed from outside", () => { 629 
| const browserOptions = { 630 | headless: true, 631 | args: ["--no-sandbox", "--disable-setuid-sandbox"], 632 | }; 633 | testWrapper("Should re-use this browser instance", async () => { 634 | const browser = await launch(browserOptions); 635 | 636 | const hashtagGenerator = createApi("hashtag", hashtags[0], { 637 | browserInstance: browser, 638 | }).generator(); 639 | await hashtagGenerator.next(); 640 | 641 | const pages = await browser.pages(); 642 | 643 | expect(pages.length).toBe(2); 644 | 645 | await browser.close(); 646 | }); 647 | 648 | testWrapper("Should not close browser instance", async () => { 649 | const browser = await launch(browserOptions); 650 | 651 | const searchGenerator = createApi("search", "therock", { 652 | browserInstance: browser, 653 | }).generator(); 654 | await searchGenerator.next(); 655 | 656 | expect(browser.isConnected()).toBe(true); 657 | 658 | await browser.close(); 659 | }); 660 | }); 661 | -------------------------------------------------------------------------------- /tests/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "target": "es2018", 5 | "noImplicitAny": false, 6 | "inlineSourceMap": true, 7 | "lib": ["dom", "es2018", "esnext.asynciterable"], 8 | "esModuleInterop": true, 9 | "resolveJsonModule": true 10 | }, 11 | "compileOnSave": false 12 | } 13 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "target": "es2018", 5 | "noImplicitAny": false, 6 | "inlineSourceMap": true, 7 | "lib": ["dom", "es2018", "esnext.asynciterable"], 8 | "resolveJsonModule": true, 9 | "esModuleInterop": true 10 | }, 11 | "include": ["*.ts", "!*.d.ts", "src/cli.ts"], 12 | "exclude": ["node_modules", "tests/*", "examples/*"] 13 | } 14 | 
-------------------------------------------------------------------------------- /tslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultSeverity": "error", 3 | "extends": [ 4 | "tslint:recommended" 5 | ], 6 | "jsRules": {}, 7 | "rules": { 8 | // Could eventually be re-enabled by removing Hashtag Location User classes 9 | "max-classes-per-file": [false], 10 | 11 | // This needs to be robust enough to support API changes without refactoring 12 | "no-string-literal": false 13 | }, 14 | "rulesDirectory": [] 15 | } -------------------------------------------------------------------------------- /utils/validation-generator/.gitignore: -------------------------------------------------------------------------------- 1 | /input.json 2 | /output.ts -------------------------------------------------------------------------------- /utils/validation-generator/README.md: -------------------------------------------------------------------------------- 1 | # API validation generator 2 | 3 | > Warning! The output we get from the `transform-json-types` library is not perfect. `output.ts` needs to be checked after the automatic transformation. 4 | 5 | This util is used to automatically generate [io-ts](https://github.com/gcanti/io-ts) runtime and type validations for the actual Instagram API. 6 | 7 | To generate these validations two steps are required: 8 | 9 | * Get an actual Instagram API response and save it as JSON 10 | * Get `io-ts` typings from it 11 | 12 | ## Actual API response 13 | 14 | `ts-node utils/validation-generator/get-input.ts` 15 | 16 | The script will save an actual API response for different endpoints in the `input.json` file (gitignored). 17 | 18 | ## Generate typings 19 | 20 | > Warning! For some reason the generated typings come out slightly wrong. You need to replace `Node3` with `Node` inside the `Post` type to make them valid. 21 | 22 | 1. 
`ts-node utils/validation-generator/generate.ts` (The script will save the typings to the `output.ts` file.) 23 | 2. Move all primitive types (those which do not use other types, like `ThumbnailResources`, `Owner` and others) to the top of the file, final types (like `Post`) to the bottom of the file, and fix all the block-scoped variable ordering errors manually. 24 | 3. Write the typing for `FullApiPost` (generally it is a `SinglePost`, but with location as an object) 25 | 4. It is better to make the main type excessive by using [io-ts-excess](https://github.com/goooseman/io-ts-excess). Here's an example: 26 | ```typescript 27 | export const SinglePost = t.type({ 28 | shortcode_media: excess(ShortcodeMedia), 29 | }); 30 | ``` 31 | By making this type excessive, you will get a validation error if new properties appear in the API. 32 | 5. Move the `SearchResult`, `User`, `Places`, `Hashtags` types to `src/api/search.ts` 33 | 6. Fix the rest of the typings 34 | 35 | ## Fix typings 36 | 37 | To quickly find all the typing errors in the project, you can run `npm test -- -t "Strict mode"` and `npm test -- -t "Full API"`. 
38 | 39 | You can get a lot of really verbose errors, like: 40 | 41 | ``` typescript 42 | Invalid value 43 | {"id":"219469050","has_public_page":true,"name":"Costa Nova, Aveiro, Portugal","slug":"costa-nova-aveiro-portugal","address_json":"{\"street_address\": \"\", \"zip_code\": \"\", \"city_name\": \"Costa Nova, Aveiro, Portugal\", \"region_name\": \"\", \"country_code\": \"PT\", \"exact_city_match\": true, \"exact_region_match\": false, \"exact_country_match\": false}"} 44 | supplied to : 45 | { shortcode_media: { __typename: string, id: string, shortcode: string, dimensions: { height: number, width: number }, gating_info: (string | null), media_preview: (string | null), display_url: string, display_resources: Array<{ src: string, config_width: number, config_height: number }>, accessibility_caption: (string | undefined), is_video: boolean, should_log_client_event: boolean, tracking_token: string, edge_media_to_tagged_user: { edges: Array<{ node: { text: (string | undefined) } }> }, edge_media_to_caption: { edges: Array<{ node: { text: (string | undefined) } }> }, caption_is_edited: boolean, has_ranked_comments: boolean, edge_media_to_parent_comment: ({ count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: ({ id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } & { edge_threaded_comments: { count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } }) }> } | undefined), edge_media_preview_comment: ({ count: number, edges: Array<{ node: { id: string, text: string, 
created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } | undefined), comments_disabled: boolean, taken_at_timestamp: number, edge_media_preview_like: { count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> }, edge_media_to_sponsor_user: { edges: Array<{ node: { text: (string | undefined) } }> }, location: (string | null), viewer_has_liked: boolean, viewer_has_saved: boolean, viewer_has_saved_to_collection: boolean, viewer_in_photo_of_you: boolean, viewer_can_reshare: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string, blocked_by_viewer: boolean, followed_by_viewer: boolean, full_name: string, has_blocked_viewer: boolean, is_private: boolean, is_unpublished: boolean, requested_by_viewer: boolean }, is_ad: boolean, edge_web_media_to_related_media: { edges: Array<{ node: { text: (string | undefined) } }> } } } 46 | /shortcode_media: { __typename: string, id: string, shortcode: string, dimensions: { height: number, width: number }, gating_info: (string | null), media_preview: (string | null), display_url: string, display_resources: Array<{ src: string, config_width: number, config_height: number }>, accessibility_caption: (string | undefined), is_video: boolean, should_log_client_event: boolean, tracking_token: string, edge_media_to_tagged_user: { edges: Array<{ node: { text: (string | undefined) } }> }, edge_media_to_caption: { edges: Array<{ node: { text: (string | undefined) } }> }, caption_is_edited: boolean, has_ranked_comments: boolean, edge_media_to_parent_comment: ({ count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ 
node: ({ id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } & { edge_threaded_comments: { count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } }) }> } | undefined), edge_media_preview_comment: ({ count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } | undefined), comments_disabled: boolean, taken_at_timestamp: number, edge_media_preview_like: { count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> }, edge_media_to_sponsor_user: { edges: Array<{ node: { text: (string | undefined) } }> }, location: (string | null), viewer_has_liked: boolean, viewer_has_saved: boolean, viewer_has_saved_to_collection: boolean, viewer_in_photo_of_you: boolean, viewer_can_reshare: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string, blocked_by_viewer: boolean, followed_by_viewer: boolean, full_name: string, has_blocked_viewer: boolean, is_private: boolean, is_unpublished: boolean, requested_by_viewer: boolean }, is_ad: boolean, edge_web_media_to_related_media: { edges: Array<{ node: { text: (string | undefined) } }> } } 47 | /location: (string | null) 48 | /1: null 49 | ``` 50 | 
51 | This looks scary, but let's make it simple. We need just two parts from the output. 52 | 53 | The first one is the text representation of the value which the validator could not validate. It is between the `Invalid value` and `supplied to` strings. 54 | The second one is the type of value it expected, and it can be found after the last or the one-before-last `/` sign. 55 | 56 | In our case the validator expected `string` or `null`, but an object has been received. 57 | 58 | So we can fix the typing in the following way: 59 | 60 | ``` typescript 61 | export const Location = t.type({ 62 | id: t.string, 63 | has_public_page: t.boolean, 64 | name: t.string, 65 | slug: t.string, 66 | address_json: t.string, 67 | }); 68 | ... 69 | location: t.union([t.string, t.null, Location]) 70 | 71 | ``` 72 | -------------------------------------------------------------------------------- /utils/validation-generator/generate.ts: -------------------------------------------------------------------------------- 1 | import {writeFileSync} from "fs"; 2 | import {dirname, join} from "path"; 3 | import transform from "transform-json-types"; 4 | // @ts-ignore 5 | import * as json from "./input.json"; 6 | 7 | /** Path of `output.ts`, resolved next to this script file. */ const getPath = () => join(dirname(__filename), "./output.ts"); 8 | 9 | /** Deletes every `const <varName> = ...;` declaration (matched up to its first semicolon) from `code`. */ const removeVarFromCode = (code: string, varName: string): string => { 10 | const regexp = new RegExp(`\nconst ${varName} =[^;]+;\n`, "gm"); 11 | return code.replace(regexp, ""); 12 | }; 13 | 14 | /** Appends an exported alias `T<typeName>` derived from the io-ts codec named `typeName`; `t.TypeOf` needs the `typeof` type argument to be valid TypeScript. */ const addTypeToCode = (code: string, typeName: string): string => { 15 | return `${code}\nexport type T${typeName} = t.TypeOf<typeof ${typeName}>;\n`; 16 | }; 17 | 18 | /** Replaces every occurrence of `varNameSingle` followed by "s" with `varNameSingle` itself. */ const singularizeVarNameInCode = ( 19 | code: string, 20 | varNameSingle: string, 21 | ): string => { 22 | const regexp = new RegExp(`${varNameSingle}s`, "gm"); 23 | return code.replace(regexp, varNameSingle); 24 | }; 25 | 26 | let output = transform(json, { 27 | lang: "io-ts", 28 | }); 29 | 30 | output = `import * as t from "io-ts";\n\n${output}`; 31 | output = `// tslint:disable: 
object-literal-sort-keys\n${output}`; 32 | output = `${output}// tslint:enable: object-literal-sort-keys\n`; 33 | output = removeVarFromCode(output, "RootInterface"); 34 | output = removeVarFromCode(output, "Default"); 35 | output = output.replace(/^const/gm, "export const"); 36 | output = output.replace(/t\.Array/gm, "t.UnknownArray"); 37 | output = output.replace(/\ string/gm, " t.string"); // Really weird 38 | output = output.replace(/t\.Integer/gm, "t.number"); // Integer does not have ts type 39 | output = singularizeVarNameInCode(output, "Post"); 40 | output = singularizeVarNameInCode(output, "SearchResult"); 41 | output = addTypeToCode(output, "Post"); 42 | output = addTypeToCode(output, "SinglePost"); 43 | output = addTypeToCode(output, "SearchResult"); 44 | 45 | writeFileSync(getPath(), output, { 46 | encoding: "utf-8", 47 | }); 48 | -------------------------------------------------------------------------------- /utils/validation-generator/get-input.ts: -------------------------------------------------------------------------------- 1 | import {writeFileSync} from "fs"; 2 | import {dirname, join} from "path"; 3 | import {createApi} from "../../"; 4 | 5 | /** Path of `input.json`, resolved next to this script file. */ const getPath = () => join(dirname(__filename), "./input.json"); 6 | 7 | /** Gathers hashtag/user posts, a fixed list of single posts and three search results into one object. */ const getResult = async () => { 8 | const posts = await getPosts({ 9 | hashtagId: "beach", 10 | userId: "snoopdogg", 11 | }); 12 | 13 | const singlePosts = await getSinglePosts({ 14 | postsIds: [ 15 | "BsOGulcndj-", 16 | "Be3rTNplCHf", 17 | "BlBvw2_jBKp", 18 | "Bi-hISIghYe", 19 | "BfzEfy-lK1N", 20 | "Bneu_dCHVdn", 21 | "Brx-adXA9C1", 22 | "BlTYHvXFrvm", 23 | "BmRZH7NFwi6", 24 | "BpiIJCUnYwy", 25 | ], 26 | }); 27 | 28 | const searchResults = await getSearch({ 29 | queries: ["beach", "nofilter", "donald"], 30 | }); 31 | 32 | return { 33 | posts, 34 | searchResults, 35 | singlePosts, 36 | }; 37 | }; 38 | 39 | /** Creates one "search" API per query and awaits each `get()` in turn, collecting the results. */ const getSearch = async ({queries}: {queries: string[]}) => { 40 | const result = []; 41 | const objects = queries.map((q) => 
createApi("search", q, {})); 42 | for (const object of objects) { 43 | result.push(await object.get()); 44 | } 45 | return result; 46 | }; 47 | 48 | /** Collects every post yielded by a "hashtag" and a "user" API, both created with `total: 10`. */ const getPosts = async ({ 49 | hashtagId, 50 | userId, 51 | }: { 52 | hashtagId: string; 53 | userId: string; 54 | }) => { 55 | const result = []; 56 | 57 | const options = { 58 | total: 10, 59 | }; 60 | const objects = [ 61 | createApi("hashtag", hashtagId, options), 62 | createApi("user", userId, options), 63 | ]; 64 | 65 | for (const object of objects) { 66 | for await (const post of object.generator()) { 67 | result.push(post); 68 | } 69 | } 70 | return result; 71 | }; 72 | 73 | /** Collects every post yielded by a "post" API created for the given ids. */ const getSinglePosts = async ({postsIds}: {postsIds: string[]}) => { 74 | const result = []; 75 | const post = createApi("post", postsIds, {}); 76 | for await (const singlePost of post.generator()) { 77 | result.push(singlePost); 78 | } 79 | return result; 80 | }; 81 | 82 | /** Serializes the gathered result as 2-space-indented JSON and writes it to `input.json`. */ const run = async () => { 83 | const result = await getResult(); 84 | const json = JSON.stringify(result, null, 2); 85 | writeFileSync(getPath(), json, { 86 | encoding: "utf-8", 87 | }); 88 | }; 89 | 90 | // tslint:disable-next-line: no-console 91 | run().catch(console.error); 92 | --------------------------------------------------------------------------------