├── .github
└── ISSUE_TEMPLATE
│ ├── bug-report.md
│ └── feature_request.md
├── .gitignore
├── .gitlab-ci.yml
├── .npmignore
├── .travis.yml
├── CONTRIBUTING.md
├── FAQ.md
├── LICENSE
├── README.md
├── assets
├── logo.png
└── logo.svg
├── docs
├── .gitignore
├── Gemfile
├── Gemfile.lock
├── _config.yml
├── _layouts
│ └── default.html
├── api-change.md
├── assets
│ └── img
│ │ └── logo.png
├── capture.webm
├── favicon.ico
└── index.md
├── examples
├── README.md
├── complexity.ts
├── package-lock.json
├── package.json
├── server.ts
├── tsconfig.json
└── tslint.json
├── index.ts
├── man
└── instamancer.1
├── package-lock.json
├── package.json
├── plugins
├── README.md
├── index.ts
├── plugin.ts
└── plugins
│ ├── index.ts
│ └── largeFirst.ts
├── src
├── api
│ ├── api.ts
│ ├── instagram.ts
│ ├── postIdSet.ts
│ ├── search.ts
│ └── types.ts
├── cli.ts
├── getpool
│ └── getPool.ts
└── http
│ ├── depot.ts
│ ├── download.ts
│ └── s3.ts
├── tests
├── __fixtures__
│ ├── FakePage.ts
│ └── QuickGraft.ts
├── server.ts
├── test.spec.ts
└── tsconfig.json
├── tsconfig.json
├── tslint.json
└── utils
└── validation-generator
├── .gitignore
├── README.md
├── generate.ts
└── get-input.ts
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help Instamancer improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ScriptSmith
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior.
15 |
16 | - If the bug is related to the CLI, include the command you used.
17 | - If it's related to using the module, provide some sample code.
18 | - If it's related to the module itself, indicate the source of the problem.
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Output**
24 | If applicable, add Instamancer's output in a code block
25 |
26 | ```
27 | here
28 | ```
29 |
30 | **Setup (please complete the following information):**
31 | - OS: [e.g. Arch Linux, MacOS]
32 | - Instamancer version [e.g. v1.1.4]
33 | - Node version [e.g. v11.6.0]
34 | - NPM version (if applicable) [e.g. 6.5.0]
35 |
36 | **Additional context**
37 | Add any other context about the problem here.
38 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE]"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | node_modules/
3 | examples/node_modules/
4 | coverage/
5 | downloads/
6 | *.map
7 | *.js
8 | *.d.ts
9 | *.tgz
10 | *.log
11 | *.csv
12 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | default:
2 | image: node:latest
3 | variables:
4 | CI: 1
5 | NO_SANDBOX: 1
6 | before_script:
7 | - npm install -g codacy-coverage
8 |
9 | - apt-get update
10 | - apt-get install -y wget gnupg
11 | - wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
12 | - sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list'
13 | - apt-get update
14 | - apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 --no-install-recommends
15 | - apt-get install -y xvfb
16 | - rm -rf /var/lib/apt/lists/*
17 | script:
18 | - npm install
19 | - npm run build -- --noEmit
20 | - xvfb-run --server-args="-screen 0 1024x768x24" npm run test:ci
21 | after_script:
22 | - cat ./coverage/lcov.info | codacy-coverage --language=typescript;
23 | artifacts:
24 | paths:
25 | - instamancer_tests.log
26 | expire_in: 1 week
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .github/
3 | docs/
4 | assets/
5 | coverage/
6 | test*
7 | .travis.yml
8 | ts*.json
9 | *.js.map
10 | *.log
11 | *.tgz
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - "node"
4 | - "lts/*"
5 | dist: bionic
6 | addons:
7 | chrome: stable
8 | artifacts:
9 | paths:
10 | - $(ls *.log | tr "\n" ":")
11 | services:
12 | - xvfb
13 | before_install:
14 | # Enable user namespace cloning for Puppeteer
15 | - sysctl kernel.unprivileged_userns_clone=1
16 | # Launch XVFB for Puppeteer
17 | - export DISPLAY=:99.0
18 | install:
19 | - npm install -g codacy-coverage
20 |
21 | # Install instamancer and deps
22 | - npm install
23 | script:
24 | - npm run build -- --noEmit
25 | - npm run test:ci
26 | - if [[ $TRAVIS_PULL_REQUEST = "false" ]] ; then cat ./coverage/lcov.info | codacy-coverage --language=typescript; fi
27 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thanks! I'm glad for your interest in the project. Here are some guidelines:
4 |
5 | ## Bugs
6 | Create a [new issue](https://github.com/ScriptSmith/instamancer/issues/new) with the provided template and the `bug` label.
7 |
8 | ## Feature / pull requests
9 | Make sure you submit a new issue with the `feature` label before submitting a pull request. If you aren't sure whether a proposed change is possible / within the scope of the project, just ask.
10 |
11 | ## Chat / Questions
12 | Pop into the [Gitter](https://gitter.im/instamancer) chat room.
--------------------------------------------------------------------------------
/FAQ.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 | ## Does it still work?
3 | At the time of writing, Instamancer still works. It's possible that it will break when Instagram.com is updated, or Instagram tries to curb this method of scraping.
4 |
5 | There is a daily CI cron job that tests whether Instamancer is working as expected. You can see the results here: [pipeline status](https://gitlab.com/ScriptSmith/instamancer/pipelines)
6 |
7 | ## Is there a GUI?
8 | No, Instamancer only works from the command-line. In the future, I might implement a GUI using [Carlo](https://github.com/GoogleChromeLabs/carlo) or something more lightweight.
9 |
10 | There is an Instagram data exploration tool in development here: [https://github.com/andyepx/insta-explorer](https://github.com/andyepx/insta-explorer)
11 |
12 | ## Do I need to log in?
13 | No. Instamancer scrapes data that Instagram makes publicly available.
14 |
15 | ## How quickly does it run?
16 | It can process anywhere from 3-30 posts per second depending on configuration.
17 |
18 | ## Can I make it run faster?
19 | Running without the `--full` and `-d` arguments is faster.
20 |
21 | Not using `--sync` and customising the `-k` option can make downloading files quicker.
22 |
23 | Disabling grafting with `-g=false` will make the scraping quicker at the cost of not being able to access all posts (see [here](#what-happens-if-i-disable-grafting)).
24 |
25 | Setting `--sleep` to a decimal number below 1 speeds up page interactions at the cost of stability, as it makes you more likely to be rate limited.
26 |
27 | Scraping is not parallelisable (see [here](#can-i-run-multiple-instances-at-the-same-time-rather-than-batch-scraping)).
28 |
29 | Using `--plugin LargeFirst` is as much as 5x faster, but may result in undefined behavior.
30 |
31 | If you want something *really* fast, try [Instaphyte](https://github.com/ScriptSmith/instaphyte). It's as much as 12x faster.
32 |
33 | ## Can I run multiple instances at the same time rather than batch scraping?
34 | No. Instagram will probably rate-limit your IP address and then Instamancer will have to pause until the limit is lifted.
35 |
36 | ## What happens if I disable grafting?
37 | Chrome / Chromium will eventually decide that it doesn't want the page to consume any more resources and future requests to the API will be aborted. This usually happens between 5k-10k posts regardless of the memory available on the system. There doesn't seem to be any combination of Chrome flags to avoid this.
38 |
39 | ## How far back can I scrape?
40 | Seemingly as far as there are posts to scrape, but you can only reach old posts by scraping the most recent ones.
41 |
42 | ## How many posts can I scrape from a given endpoint?
43 | The most I've seen is more than 5 million.
44 |
45 | ## How do I scrape the first posts on the page?
46 |
47 | In the default configuration, Instamancer will skip the posts that are pre-loaded on the page. This is because it only retrieves posts generated from API requests, which aren't made for these posts.
48 |
49 | If you would like to retrieve these posts, then you should use full mode: `--full` or `-f`.
50 |
51 | This behavior may change in the future.
52 |
53 | ## How do I use the `--bucket` flag and S3?
54 | 1. Create an S3 bucket. Find help [here](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html).
55 | 2. Configure your AWS credentials. Find help [here](https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/loading-node-credentials-shared.html).
56 | 1. Ensure you can write to S3 with the credentials you're using.
57 | 3. Use instamancer like so:
58 |
59 | ```
60 | instamancer ... -d --bucket=BUCKET_NAME
61 | ```
62 |
63 | Where `BUCKET_NAME` is the name of the bucket.
64 |
65 | Example:
66 |
67 | ```
68 | instamancer hashtag puppies -c10 -d --bucket=instagram-puppies
69 | ```
70 |
71 |
72 | ## How do I use the `--depot` flag and depot?
73 | 1. Set up [depot](https://github.com/ScriptSmith/depot)
74 | 1. Set up basic access authentication if you're using a public server
75 | 2. Generate a UUIDv4
76 | 3. Use instamancer like so:
77 |
78 | ```
79 | instamancer ... -d --depot=http://127.0.0.1:8080/jobs/UUID/
80 | ```
81 |
82 | Where `UUID` is the UUID you generated.
83 |
84 | Example:
85 |
86 | ```
87 | instamancer hashtag puppies -c10 -d --depot=https://depot:password@depot-vlnbfvyaiq-uc.a.run.app/jobs/4cdc21fe-6b35-473a-b26e-66f62ad66c4c/
88 | ```
89 |
90 | You can use any server that accepts `PUT` requests.
91 |
92 |
93 | ## What does a batchfile look like?
94 | ```
95 | hashtag spring -d --full
96 | hashtag summer -f=data.json
97 | user greg -c100
98 | ```
99 |
100 | ## Why does the code have so many comments?
101 | Instamancer was originally part of another project written in Python that used the [Pyppeteer](https://github.com/miyakogi/pyppeteer) clone of Puppeteer. This version was too error-prone because of the complicated asyncio code and Pyppeteer's instability when communicating via websockets during long scraping jobs.
102 |
103 | I decided to rewrite Instamancer in TypeScript in order to be more stable and in-sync with Puppeteer. It was the first time I'd written any serious TypeScript or 'modern' JavaScript (promises, async/await etc.), so the zealous commenting helped me learn, and allowed me to figure out bugs in my algorithm and the grafting process. The comments aren't a permanent fixture and may be removed in a future commit.
104 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Adam Smith
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Instamancer
6 |
7 | [Code quality](https://app.codacy.com/project/ScriptSmith/instamancer/dashboard)
8 | [Test coverage](https://app.codacy.com/project/ScriptSmith/instamancer/dashboard)
9 | [Speed test](https://scriptsmith.github.io/instagram-speed-test)
10 | [NPM package](https://www.npmjs.com/package/instamancer)
11 | [Dependencies](https://david-dm.org/scriptsmith/instamancer)
12 | [Gitter chat](https://gitter.im/instamancer)
13 |
14 | Scrape Instagram's API with Puppeteer.
15 |
16 | ###### [Install](#Install) | [Usage](#Usage) | [Comparison](#Comparison) | [Website](https://scriptsmith.github.io/instamancer/) | [FAQ](FAQ.md) | [Examples](examples/README.md)
17 |
18 |
19 |
20 | **Notice:** Instagram's Web UI and API now require users to be logged in to access hashtag and account endpoints through a browser. As Instamancer is designed to access publicly available data, it currently does not work as intended. Given that this change is unlikely to be reversed, Instamancer will remain unsupported and unmaintained indefinitely. Please use [this pinned issue](https://github.com/ScriptSmith/instamancer/issues/58) to discuss.
21 |
22 |
23 |
24 |
25 | Instamancer is a new type of scraping tool that leverages Puppeteer's ability to intercept requests made by a webpage to an API.
26 |
27 | Read more about how Instamancer works [here](https://scriptsmith.github.io/instamancer/).
28 |
29 | ### Features
30 | - Scrape hashtags, users' posts, and individual posts
31 | - Download images, albums, and videos
32 | - Output JSON, CSV
33 | - Batch scraping
34 | - Search hashtags, users, and locations
35 | - API response validation
36 | - Upload files to [S3](https://github.com/ScriptSmith/instamancer/blob/master/FAQ.md#how-do-i-use-the---bucket-flag-and-s3) and [depot](https://github.com/ScriptSmith/instamancer/blob/master/FAQ.md#how-do-i-use-the---depot-flag-and-depot)
37 | - [Plugins](plugins)
38 |
39 | ### Data
40 | Metadata that Instamancer is able to gather from posts:
41 |
42 | - Text
43 | - Timestamps
44 | - Tagged users
45 | - Accessibility captions
46 | - Like counts
47 | - Comment counts
48 | - Images (Thumbnails, Dimensions, URLs)
49 | - Videos (URL, View count, Duration)
50 | - Comments (Timestamp, Text, Like count, User)
51 | - User (Username, Full name, Profile picture, Profile privacy)
52 | - Location (Name, Street, Zip code, City, Region, Country)
53 | - Sponsored status
54 | - Gating information
55 | - Fact checking information
56 |
57 | ## Install
58 |
59 | #### Linux
60 | Enable user namespace cloning:
61 | ```
62 | sysctl -w kernel.unprivileged_userns_clone=1
63 | ```
64 |
65 | Or run without a sandbox:
66 |
67 | ```
68 | # WARNING: unsafe
69 | export NO_SANDBOX=true
70 | ```
71 |
72 | See [Puppeteer troubleshooting](https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md#chrome-headless-fails-due-to-sandbox-issues)
73 |
74 | #### Without downloading chromium
75 | If you wish to install Instamancer without downloading chromium, enable the `PUPPETEER_SKIP_CHROMIUM_DOWNLOAD` environment variable before installation
76 |
77 | ```
78 | export PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
79 | ```
80 |
81 | ### From NPM
82 |
83 | ```
84 | npm install -g instamancer
85 | ```
86 |
87 | If you're using root to install globally, use the following command to install the Puppeteer dependency
88 |
89 | ```
90 | sudo npm install -g instamancer --unsafe-perm=true
91 | ```
92 |
93 | ### From NPX
94 |
95 | ```
96 | npx instamancer
97 | ```
98 |
99 | ### From this repository
100 | ```
101 | git clone https://github.com/ScriptSmith/instamancer.git
102 | cd instamancer
103 | npm install
104 | npm run build
105 | npm install -g
106 | ```
107 |
108 | ## Usage
109 |
110 | ### Command Line
111 | ```
112 | $ instamancer
113 | Usage: instamancer [options]
114 |
115 | Commands:
116 | instamancer hashtag [id] Scrape a hashtag
117 | instamancer user [id] Scrape a users posts
118 | instamancer post [ids] Scrape a comma-separated list of posts
119 | instamancer search [query] Perform a search of users, tags and places
120 | instamancer batch [batchfile] Read newline-separated arguments from a file
121 |
122 | Configuration
123 | --count, -c Number of posts to download (0 for all) [number] [default: 0]
124 | --full, -f Retrieve full post data [boolean] [default: false]
125 | --sleep, -s Seconds to sleep between interactions [number] [default: 2]
126 | --graft, -g Enable grafting [boolean] [default: true]
127 | --browser, -b Browser path. Defaults to the puppeteer version [string]
128 | --sameBrowser Use a single browser when grafting [boolean] [default: false]
129 |
130 | Download
131 | --download, -d Save images from posts [boolean] [default: false]
132 | --downdir Download path [default: "downloads/[endpoint]/[id]"]
133 | --video, -v Download videos (requires full) [boolean] [default: false]
134 | --sync Force download between requests [boolean] [default: false]
135 | --threads, -k Parallel download / depot threads [number] [default: 4]
136 | --waitDownload, -w Download media after scraping [boolean] [default: false]
137 |
138 | Upload
139 | --bucket Upload files to an AWS S3 bucket [string]
140 | --depot Upload files to a URL with a PUT request (depot) [string]
141 |
142 | Output
143 | --file, -o Output filename. '-' for stdout [string] [default: "[id]"]
144 | --type, -t Filetype [choices: "csv", "json", "both"] [default: "json"]
145 | --mediaPath, -m Add filepaths to _mediaPath [boolean] [default: false]
146 |
147 | Display
148 | --visible Show browser on the screen [boolean] [default: false]
149 | --quiet, -q Disable progress output [boolean] [default: false]
150 |
151 | Logging
152 | --logging, -l [choices: "none", "error", "info", "debug"] [default: "none"]
153 | --logfile Log file name [string] [default: "instamancer.log"]
154 |
155 | Validation
156 | --strict Throw an error on response type mismatch [boolean] [default: false]
157 |
158 | Plugins
159 | --plugin, -p Use a plugin from the plugins directory [array] [default: []]
160 |
161 | Options:
162 | --help Show help [boolean]
163 | --version Show version number [boolean]
164 |
165 | Examples:
166 | instamancer hashtag instagood -fvd Download all the available posts,
167 | and their media from #instagood
168 | instamancer user arianagrande --type=csv Download Ariana Grande's posts to a
169 | --logging=info --visible CSV file with a non-headless
170 | browser, and log all events
171 |
172 | Source code available at https://github.com/ScriptSmith/instamancer
173 |
174 | ```
175 |
176 | ### Module
177 |
178 | ES2018 TypeScript example:
179 | ```typescript
180 | import {createApi, IOptions} from "instamancer"
181 |
182 | const options: IOptions = {
183 | total: 10
184 | };
185 | const hashtag = createApi("hashtag", "beach", options);
186 |
187 | (async () => {
188 | for await (const post of hashtag.generator()) {
189 | console.log(post);
190 | }
191 | })();
192 | ```
193 |
194 | #### Generator functions
195 |
196 | ```typescript
197 | import {createApi} from "instamancer"
198 |
199 | createApi("hashtag", id, options);
200 | createApi("user", id, options);
201 | createApi("post", ids, options);
202 | createApi("search", query, options);
203 | ```
204 |
205 | #### Options
206 | ```typescript
207 | const options: Instamancer.IOptions = {
208 | // Total posts to download. 0 for unlimited
209 | total: number,
210 |
211 | // Run Chrome in headless mode
212 | headless: boolean,
213 |
214 | // Logging events
215 | logger: winston.Logger,
216 |
217 | // Run without output to stdout
218 | silent: boolean,
219 |
220 | // Time to sleep between interactions with the page
221 | sleepTime: number,
222 |
223 | // Throw an error if type validation fails
224 | strict: boolean,
225 |
226 | // Time to sleep when rate-limited
227 | hibernationTime: number,
228 |
229 | // Enable the grafting process
230 | enableGrafting: boolean,
231 |
232 | // Extract the full amount of information from the API
233 | fullAPI: boolean,
234 |
235 | // Use a proxy in Chrome to connect to Instagram
236 | proxyURL: string,
237 |
238 | // Location of the chromium / chrome binary executable
239 | executablePath: string,
240 |
241 | // Custom io-ts validator
242 | validator: Type,
243 |
244 | // Custom plugins
245 | plugins: IPlugin[]
246 | }
247 | ```
248 |
249 | ## Comparison
250 |
251 | A comparison of Instagram scraping tools. Please suggest more tools and criteria through a pull request.
252 |
253 | To see a speed comparison, visit [this page](https://scriptsmith.github.io/instagram-speed-test)
254 |
255 |
256 |
257 |
258 | Tool
259 | Hashtags
260 | Users
261 | Tagged posts
262 | Locations
263 | Posts
264 | Stories
265 | Login not required
266 | Private feeds
267 | Batch mode
268 | Plugins
269 | Command-line
270 | Library/Module
271 | Download media
272 | Download metadata
273 | Scraping method
274 | Daily builds
275 | Main language
276 | Speed ____________________________
277 | License ____________________________
278 | Last commit ____________________________
279 | Open Issues ____________________________
280 | Closed Issues ____________________________
281 | Build status ____________________________
282 | Test coverage ____________________________
283 | Code quality ____________________________
284 |
285 |
286 |
287 |
288 | Instamancer
289 | :heavy_check_mark:
290 | :heavy_check_mark:
291 | :x:
292 | :x:
293 | :heavy_check_mark:
294 | :x:
295 | :heavy_check_mark:
296 | :x:
297 | :heavy_check_mark:
298 | :heavy_check_mark:
299 | :heavy_check_mark:
300 | :heavy_check_mark:
301 | :heavy_check_mark:
302 | :heavy_check_mark:
303 | Web API request interception
304 | :heavy_check_mark:
305 | Typescript
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 | Instaphyte
317 | :heavy_check_mark:
318 | :x:
319 | :x:
320 | :x:
321 | :x:
322 | :x:
323 | :heavy_check_mark:
324 | :x:
325 | :x:
326 | :x:
327 | :heavy_check_mark:
328 | :heavy_check_mark:
329 | :heavy_check_mark:
330 | :heavy_check_mark:
331 | Web API simulation
332 | :heavy_check_mark:
333 | Python
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 | Instaloader
345 | :heavy_check_mark:
346 | :heavy_check_mark:
347 | :heavy_check_mark:
348 | :heavy_check_mark:
349 | :heavy_check_mark:
350 | :heavy_check_mark:
351 | :heavy_check_mark:
352 | :heavy_check_mark:
353 | :x:
354 | :x:
355 | :heavy_check_mark:
356 | :heavy_check_mark:
357 | :heavy_check_mark:
358 | :heavy_check_mark:
359 | Web API simulation
360 | :x:
361 | Python
362 |
363 |
364 |
365 |
366 |
367 |
368 | :question:
369 | :question:
370 |
371 |
372 | Instalooter
373 | :heavy_check_mark:
374 | :heavy_check_mark:
375 | :x:
376 | :heavy_check_mark:
377 | :heavy_check_mark:
378 | :x:
379 | :x:
380 | :heavy_check_mark:
381 | :heavy_check_mark:
382 | :x:
383 | :heavy_check_mark:
384 | :heavy_check_mark:
385 | :heavy_check_mark:
386 | :heavy_check_mark:
387 | Web API simulation
388 | :x:
389 | Python
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 | Instagram crawler
401 | :heavy_check_mark:
402 | :heavy_check_mark:
403 | :x:
404 | :x:
405 | :heavy_check_mark:
406 | :x:
407 | :heavy_check_mark:
408 | :x:
409 | :x:
410 | :x:
411 | :heavy_check_mark:
412 | :heavy_check_mark:
413 | :x:
414 | :heavy_check_mark:
415 | Web DOM reading
416 | :x:
417 | Python
418 | :question:
419 |
420 |
421 |
422 |
423 |
424 | :question:
425 | :question:
426 |
427 |
428 | Instagram Scraper
429 | :heavy_check_mark:
430 | :heavy_check_mark:
431 | :heavy_check_mark:
432 | :heavy_check_mark:
433 | :x:
434 | :heavy_check_mark:
435 | :x:
436 | :heavy_check_mark:
437 | :x:
438 | :x:
439 | :heavy_check_mark:
440 | :heavy_check_mark:
441 | :heavy_check_mark:
442 | :heavy_check_mark:
443 | Web API simulation
444 | :x:
445 | Python
446 |
447 |
448 |
449 |
450 |
451 |
452 | :question:
453 | :question:
454 |
455 |
456 | Instagram Private API
457 | :heavy_check_mark:
458 | :heavy_check_mark:
459 | :heavy_check_mark:
460 | :heavy_check_mark:
461 | :heavy_check_mark:
462 | :heavy_check_mark:
463 | :heavy_check_mark:
464 | :heavy_check_mark:
465 | :x:
466 | :x:
467 | :x:
468 | :heavy_check_mark:
469 | :heavy_check_mark:
470 | :heavy_check_mark:
471 | App and Web API simulation
472 | :x:
473 | Python
474 | :question:
475 |
476 |
477 |
478 |
479 |
480 | :question:
481 | :question:
482 |
483 |
484 | Instagram PHP Scraper
485 | :heavy_check_mark:
486 | :heavy_check_mark:
487 | :x:
488 | :heavy_check_mark:
489 | :heavy_check_mark:
490 | :x:
491 | :heavy_check_mark:
492 | :heavy_check_mark:
493 | :x:
494 | :x:
495 | :x:
496 | :heavy_check_mark:
497 | :heavy_check_mark:
498 | :heavy_check_mark:
499 | Web API simulation
500 | :x:
501 | PHP
502 | :question:
503 |
504 |
505 |
506 |
507 | :question:
508 | :question:
509 | :question:
510 |
511 |
512 |
513 |
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/assets/logo.png
--------------------------------------------------------------------------------
/assets/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
15 |
17 |
18 |
20 |
22 |
26 |
30 |
31 |
33 |
37 |
41 |
42 |
44 |
48 |
52 |
53 |
56 |
60 |
61 |
70 |
79 |
88 |
96 |
97 |
101 |
105 |
109 |
113 |
117 |
121 |
122 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _site/
2 | .jekyll-metadata
--------------------------------------------------------------------------------
/docs/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gem "github-pages", group: :jekyll_plugins
--------------------------------------------------------------------------------
/docs/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | activesupport (4.2.11.1)
5 | i18n (~> 0.7)
6 | minitest (~> 5.1)
7 | thread_safe (~> 0.3, >= 0.3.4)
8 | tzinfo (~> 1.1)
9 | addressable (2.7.0)
10 | public_suffix (>= 2.0.2)
11 | coffee-script (2.4.1)
12 | coffee-script-source
13 | execjs
14 | coffee-script-source (1.11.1)
15 | colorator (1.1.0)
16 | commonmarker (0.17.13)
17 | ruby-enum (~> 0.5)
18 | concurrent-ruby (1.1.5)
19 | dnsruby (1.61.3)
20 | addressable (~> 2.5)
21 | em-websocket (0.5.1)
22 | eventmachine (>= 0.12.9)
23 | http_parser.rb (~> 0.6.0)
24 | ethon (0.12.0)
25 | ffi (>= 1.3.0)
26 | eventmachine (1.2.7)
27 | execjs (2.7.0)
28 | faraday (0.17.0)
29 | multipart-post (>= 1.2, < 3)
30 | ffi (1.11.1)
31 | forwardable-extended (2.6.0)
32 | gemoji (3.0.1)
33 | github-pages (201)
34 | activesupport (= 4.2.11.1)
35 | github-pages-health-check (= 1.16.1)
36 | jekyll (= 3.8.5)
37 | jekyll-avatar (= 0.6.0)
38 | jekyll-coffeescript (= 1.1.1)
39 | jekyll-commonmark-ghpages (= 0.1.6)
40 | jekyll-default-layout (= 0.1.4)
41 | jekyll-feed (= 0.11.0)
42 | jekyll-gist (= 1.5.0)
43 | jekyll-github-metadata (= 2.12.1)
44 | jekyll-mentions (= 1.4.1)
45 | jekyll-optional-front-matter (= 0.3.0)
46 | jekyll-paginate (= 1.1.0)
47 | jekyll-readme-index (= 0.2.0)
48 | jekyll-redirect-from (= 0.14.0)
49 | jekyll-relative-links (= 0.6.0)
50 | jekyll-remote-theme (= 0.4.0)
51 | jekyll-sass-converter (= 1.5.2)
52 | jekyll-seo-tag (= 2.5.0)
53 | jekyll-sitemap (= 1.2.0)
54 | jekyll-swiss (= 0.4.0)
55 | jekyll-theme-architect (= 0.1.1)
56 | jekyll-theme-cayman (= 0.1.1)
57 | jekyll-theme-dinky (= 0.1.1)
58 | jekyll-theme-hacker (= 0.1.1)
59 | jekyll-theme-leap-day (= 0.1.1)
60 | jekyll-theme-merlot (= 0.1.1)
61 | jekyll-theme-midnight (= 0.1.1)
62 | jekyll-theme-minimal (= 0.1.1)
63 | jekyll-theme-modernist (= 0.1.1)
64 | jekyll-theme-primer (= 0.5.3)
65 | jekyll-theme-slate (= 0.1.1)
66 | jekyll-theme-tactile (= 0.1.1)
67 | jekyll-theme-time-machine (= 0.1.1)
68 | jekyll-titles-from-headings (= 0.5.1)
69 | jemoji (= 0.10.2)
70 | kramdown (= 1.17.0)
71 | liquid (= 4.0.0)
72 | listen (= 3.1.5)
73 | mercenary (~> 0.3)
74 | minima (= 2.5.0)
75 | nokogiri (>= 1.10.4, < 2.0)
76 | rouge (= 3.11.0)
77 | terminal-table (~> 1.4)
78 | github-pages-health-check (1.16.1)
79 | addressable (~> 2.3)
80 | dnsruby (~> 1.60)
81 | octokit (~> 4.0)
82 | public_suffix (~> 3.0)
83 | typhoeus (~> 1.3)
84 | html-pipeline (2.12.0)
85 | activesupport (>= 2)
86 | nokogiri (>= 1.4)
87 | http_parser.rb (0.6.0)
88 | i18n (0.9.5)
89 | concurrent-ruby (~> 1.0)
90 | jekyll (3.8.5)
91 | addressable (~> 2.4)
92 | colorator (~> 1.0)
93 | em-websocket (~> 0.5)
94 | i18n (~> 0.7)
95 | jekyll-sass-converter (~> 1.0)
96 | jekyll-watch (~> 2.0)
97 | kramdown (~> 1.14)
98 | liquid (~> 4.0)
99 | mercenary (~> 0.3.3)
100 | pathutil (~> 0.9)
101 | rouge (>= 1.7, < 4)
102 | safe_yaml (~> 1.0)
103 | jekyll-avatar (0.6.0)
104 | jekyll (~> 3.0)
105 | jekyll-coffeescript (1.1.1)
106 | coffee-script (~> 2.2)
107 | coffee-script-source (~> 1.11.1)
108 | jekyll-commonmark (1.3.1)
109 | commonmarker (~> 0.14)
110 | jekyll (>= 3.7, < 5.0)
111 | jekyll-commonmark-ghpages (0.1.6)
112 | commonmarker (~> 0.17.6)
113 | jekyll-commonmark (~> 1.2)
114 | rouge (>= 2.0, < 4.0)
115 | jekyll-default-layout (0.1.4)
116 | jekyll (~> 3.0)
117 | jekyll-feed (0.11.0)
118 | jekyll (~> 3.3)
119 | jekyll-gist (1.5.0)
120 | octokit (~> 4.2)
121 | jekyll-github-metadata (2.12.1)
122 | jekyll (~> 3.4)
123 | octokit (~> 4.0, != 4.4.0)
124 | jekyll-mentions (1.4.1)
125 | html-pipeline (~> 2.3)
126 | jekyll (~> 3.0)
127 | jekyll-optional-front-matter (0.3.0)
128 | jekyll (~> 3.0)
129 | jekyll-paginate (1.1.0)
130 | jekyll-readme-index (0.2.0)
131 | jekyll (~> 3.0)
132 | jekyll-redirect-from (0.14.0)
133 | jekyll (~> 3.3)
134 | jekyll-relative-links (0.6.0)
135 | jekyll (~> 3.3)
136 | jekyll-remote-theme (0.4.0)
137 | addressable (~> 2.0)
138 | jekyll (~> 3.5)
139 | rubyzip (>= 1.2.1, < 3.0)
140 | jekyll-sass-converter (1.5.2)
141 | sass (~> 3.4)
142 | jekyll-seo-tag (2.5.0)
143 | jekyll (~> 3.3)
144 | jekyll-sitemap (1.2.0)
145 | jekyll (~> 3.3)
146 | jekyll-swiss (0.4.0)
147 | jekyll-theme-architect (0.1.1)
148 | jekyll (~> 3.5)
149 | jekyll-seo-tag (~> 2.0)
150 | jekyll-theme-cayman (0.1.1)
151 | jekyll (~> 3.5)
152 | jekyll-seo-tag (~> 2.0)
153 | jekyll-theme-dinky (0.1.1)
154 | jekyll (~> 3.5)
155 | jekyll-seo-tag (~> 2.0)
156 | jekyll-theme-hacker (0.1.1)
157 | jekyll (~> 3.5)
158 | jekyll-seo-tag (~> 2.0)
159 | jekyll-theme-leap-day (0.1.1)
160 | jekyll (~> 3.5)
161 | jekyll-seo-tag (~> 2.0)
162 | jekyll-theme-merlot (0.1.1)
163 | jekyll (~> 3.5)
164 | jekyll-seo-tag (~> 2.0)
165 | jekyll-theme-midnight (0.1.1)
166 | jekyll (~> 3.5)
167 | jekyll-seo-tag (~> 2.0)
168 | jekyll-theme-minimal (0.1.1)
169 | jekyll (~> 3.5)
170 | jekyll-seo-tag (~> 2.0)
171 | jekyll-theme-modernist (0.1.1)
172 | jekyll (~> 3.5)
173 | jekyll-seo-tag (~> 2.0)
174 | jekyll-theme-primer (0.5.3)
175 | jekyll (~> 3.5)
176 | jekyll-github-metadata (~> 2.9)
177 | jekyll-seo-tag (~> 2.0)
178 | jekyll-theme-slate (0.1.1)
179 | jekyll (~> 3.5)
180 | jekyll-seo-tag (~> 2.0)
181 | jekyll-theme-tactile (0.1.1)
182 | jekyll (~> 3.5)
183 | jekyll-seo-tag (~> 2.0)
184 | jekyll-theme-time-machine (0.1.1)
185 | jekyll (~> 3.5)
186 | jekyll-seo-tag (~> 2.0)
187 | jekyll-titles-from-headings (0.5.1)
188 | jekyll (~> 3.3)
189 | jekyll-watch (2.2.1)
190 | listen (~> 3.0)
191 | jemoji (0.10.2)
192 | gemoji (~> 3.0)
193 | html-pipeline (~> 2.2)
194 | jekyll (~> 3.0)
195 | kramdown (1.17.0)
196 | liquid (4.0.0)
197 | listen (3.1.5)
198 | rb-fsevent (~> 0.9, >= 0.9.4)
199 | rb-inotify (~> 0.9, >= 0.9.7)
200 | ruby_dep (~> 1.2)
201 | mercenary (0.3.6)
202 | mini_portile2 (2.4.0)
203 | minima (2.5.0)
204 | jekyll (~> 3.5)
205 | jekyll-feed (~> 0.9)
206 | jekyll-seo-tag (~> 2.1)
207 | minitest (5.12.2)
208 | multipart-post (2.1.1)
209 | nokogiri (1.10.8)
210 | mini_portile2 (~> 2.4.0)
211 | octokit (4.14.0)
212 | sawyer (~> 0.8.0, >= 0.5.3)
213 | pathutil (0.16.2)
214 | forwardable-extended (~> 2.6)
215 | public_suffix (3.1.1)
216 | rb-fsevent (0.10.3)
217 | rb-inotify (0.10.0)
218 | ffi (~> 1.0)
219 | rouge (3.11.0)
220 | ruby-enum (0.7.2)
221 | i18n
222 | ruby_dep (1.5.0)
223 | rubyzip (2.0.0)
224 | safe_yaml (1.0.5)
225 | sass (3.7.4)
226 | sass-listen (>= 4.0.0)
227 | sass-listen (4.0.0)
228 | rb-inotify (>= 0.9.7, >= 0.9)
229 | sawyer (0.8.2)
230 | addressable (>= 2.3.5)
231 | faraday (> 0.8, < 2.0)
232 | terminal-table (1.8.0)
233 | unicode-display_width (~> 1.1, >= 1.1.1)
234 | thread_safe (0.3.6)
235 | typhoeus (1.3.1)
236 | ethon (>= 0.9.0)
237 | tzinfo (1.2.5)
238 | thread_safe (~> 0.1)
239 | unicode-display_width (1.6.0)
240 |
241 | PLATFORMS
242 | ruby
243 |
244 | DEPENDENCIES
245 | github-pages
246 |
247 | BUNDLED WITH
248 | 2.0.1
249 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | title: Instamancer
2 | logo: /assets/img/logo.png
3 | description: Scrape Instagram's API with Puppeteer.
4 | show_downloads: false
5 | theme: jekyll-theme-minimal
6 | repository: ScriptSmith/instamancer
7 | google_analytics: UA-79900226-3
8 |
--------------------------------------------------------------------------------
/docs/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | {% if site.logo %}
9 |
10 | {% endif %}
11 |
12 | {% seo %}
13 |
14 |
17 |
18 |
19 |
20 |
46 |
47 |
48 | {{ content }}
49 |
50 |
51 |
57 |
58 |
59 | {% if site.google_analytics %}
60 |
68 | {% endif %}
69 |
70 |
71 |
--------------------------------------------------------------------------------
/docs/api-change.md:
--------------------------------------------------------------------------------
1 | # The Instagram API has changed
2 | Because of the way instamancer works, when Instagram changes the API for their web frontend, the data that Instamancer gathers will be affected.
3 |
4 | If you see this warning, you can:
5 |
6 | - Check for [updates](https://github.com/ScriptSmith/instamancer/releases). A new version of instamancer may have been released.
7 | - Look for reports in [open issues](https://github.com/ScriptSmith/instamancer/issues). Maybe someone else is having this problem, and is already working on a fix.
8 | - Open a [new issue](https://github.com/ScriptSmith/instamancer/issues/new/choose) if you can't find an existing one.
9 | - Create a fork of the repository and [fix the typings](https://github.com/ScriptSmith/instamancer/blob/master/utils/validation-generator/README.md#fix-typings) yourself.
10 |
--------------------------------------------------------------------------------
/docs/assets/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/assets/img/logo.png
--------------------------------------------------------------------------------
/docs/capture.webm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/capture.webm
--------------------------------------------------------------------------------
/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScriptSmith/instamancer/1c63cad47886d0831ae6cc44812b72aefa7414a9/docs/favicon.ico
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # About Instamancer
6 |
7 | Instamancer is a scraping tool used in Instagram data mining and analysis projects.
8 |
9 | Traditional Instagram scrapers either use a browser to access a web-page and read the DOM, or they manually reimplement the requests that browsers make to an API endpoint. This isn't ideal because:
10 |
11 | 1. Reading the DOM ignores some information that's only stored in memory.
12 | 2. Reimplementing requests requires the deciphering and reproduction of pagination and authentication mechanisms.
13 | 3. Neither method easily tolerates changes to the front and back end.
14 |
15 | Instamancer is unique because it doesn't read the DOM or reimplement requests. Using [Puppeteer](https://github.com/GoogleChrome/puppeteer/) it interacts with Instagram.com, then intercepts and saves the responses to requests that the page's JavaScript initiates. This means that it can retrieve the full amount of information from the API while tolerating failed requests and rate limits, without having to reimplement client-side code. This makes it much better at withstanding regular changes to the interface and API.
16 |
17 | As browsers become more and more like black boxes, this new scraping method will become increasingly relevant.
18 |
19 | Instamancer also comes with some clever tricks:
20 |
21 | - Because using a browser consumes lots of memory in large scraping jobs, Instamancer employs a new scraping technique called *grafting*. It intercepts and saves the URL and headers of each request, and then after a certain number of interactions with the page it will restart the browser and navigate back to the same page. Once the page initiates the first request to the API, its URL and headers are swapped on-the-fly with the most recently saved ones. The scraping continues without incident because the response from the API is in the correct form despite being for the incorrect data.
22 | - Requests from pages for media and other non-API urls are intercepted and aborted to speed up scraping and conserve resources.
23 | - Instagram sends limited information through its feed API. To get extra information like the location, tagged users, and comments, Instamancer can open new tabs for each post that it scrapes, and then read the metadata from memory.
24 |
25 | # Installation
26 |
27 | To get started with Instamancer, follow the installation instructions [here](https://github.com/ScriptSmith/instamancer#Install)
28 |
29 | # Output
30 |
31 | ## Metadata
32 |
33 | Instamancer outputs metadata into JSON and CSV files.
34 |
35 | Here's a sample of output without `--full` mode:
36 |
37 | ```json
38 | [
39 | {
40 | "node": {
41 | "comments_disabled": false,
42 | "__typename": "GraphImage",
43 | "id": "1953636359851103977",
44 | "edge_media_to_caption": {
45 | "edges": [
46 | {
47 | "node": {
48 | "text": "Love my #dogs"
49 | }
50 | }
51 | ]
52 | },
53 | "shortcode": "BsrrAClca9F",
54 | "edge_media_to_comment": {
55 | "count": 1
56 | },
57 | "taken_at_timestamp": 1547102918,
58 | "dimensions": {
59 | "height": 1350,
60 | "width": 1080
61 | },
62 | "display_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5edccf8779ca7659a5ee7bb3e5bb0ec4/5CD38B5F/t51.2885-15/e35/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
63 | "edge_liked_by": {
64 | "count": 3
65 | },
66 | "edge_media_preview_like": {
67 | "count": 3
68 | },
69 | "owner": {
70 | "id": "1838071775"
71 | },
72 | "thumbnail_src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5d074edce4bd1bdb02cadb670dd62571/5CBF791C/t51.2885-15/sh0.08/e35/c0.135.1080.1080/s640x640/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
73 | "thumbnail_resources": [
74 | {
75 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/418024ac735200f61193e0de0bc2b79f/5CC9DD07/t51.2885-15/e35/c0.135.1080.1080/s150x150/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
76 | "config_width": 150,
77 | "config_height": 150
78 | },
79 | {
80 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ca0843efc1fa41da05f401d1d2d99c80/5CC6C84D/t51.2885-15/e35/c0.135.1080.1080/s240x240/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
81 | "config_width": 240,
82 | "config_height": 240
83 | },
84 | {
85 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5560c9aa0cbaf43d93b9f57da63f46ae/5CD068F7/t51.2885-15/e35/c0.135.1080.1080/s320x320/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
86 | "config_width": 320,
87 | "config_height": 320
88 | },
89 | {
90 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/1842510041138b9f71cba3a7e7991f47/5CCEDFAD/t51.2885-15/e35/c0.135.1080.1080/s480x480/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
91 | "config_width": 480,
92 | "config_height": 480
93 | },
94 | {
95 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5d074edce4bd1bdb02cadb670dd62571/5CBF791C/t51.2885-15/sh0.08/e35/c0.135.1080.1080/s640x640/49522041_130894740706474_725467490028727537_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
96 | "config_width": 640,
97 | "config_height": 640
98 | }
99 | ],
100 | "is_video": false,
101 | "accessibility_caption": "Image may contain: 1 person, dog, outdoor, closeup, water and nature"
102 | }
103 | }
104 | ]
105 | ```
106 |
107 | And with `--full` mode:
108 |
109 | ```json
110 | [
111 | {
112 | "shortcode_media": {
113 | "__typename": "GraphImage",
114 | "id": "1958565413572638000",
115 | "shortcode": "BsHcdeHyEgY",
116 | "dimensions": {
117 | "height": 1349,
118 | "width": 1080
119 | },
120 | "gating_info": null,
121 | "media_preview": "ACEqQWKuuSmQWblCFPU8YPGPQc/WpoLWFSGRSpzwWJB7cH056eh9qvwlU3KeNrn8m+YfzxUMk8e4gfNn7wHT/wDXWd/IuwGMEkgsPXJ6H3zntR5Jzwcj04/nj/61EU6sflOT6Hr/APXHp3Hv0q0u05wMHuPShyt0CxT2D3/Sirm3/OKKXMPlM1sgEjq2B9akEIhXL/KfXGc/l/KoTcQHlWJOeBtPH/6qtI5ZQT0HXP8AP/69Uk3sK6RlzRuMMePccf59s81dUtcEGV9igAfLwT6kt2B9KDdI+VABx0yOPc89hWRK/mOxXkHpgZz/APWoa7iuafl2v/PWisjn+7RRYdyM8+/+f896lhmki+UHcB/CeR/n8cVWJI6U89BVElmS5aRQDg47AYHr+P0qMEkZ7UHq30/pSpSAb8vvRU2BRQM//9k=",
122 | "display_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ff493b6b24e6e2be7df1ec9644d5339c/5CD16638/t51.2885-15/e35/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
123 | "display_resources": [
124 | {
125 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/5150c80ce526c6f6bd4da78e4f57979f/5CBBD552/t51.2885-15/sh0.08/e35/p640x640/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
126 | "config_width": 640,
127 | "config_height": 799
128 | },
129 | {
130 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/54585546542f3fae7f25ab23d219fd75/5CB87296/t51.2885-15/sh0.08/e35/p750x750/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
131 | "config_width": 750,
132 | "config_height": 937
133 | },
134 | {
135 | "src": "https://instagram.fbne3-1.fna.fbcdn.net/vp/ff493b6b24e6e2be7df1ec9644d5339c/5CD16638/t51.2885-15/e35/49472607_1820670783526329_6546442839896910927_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
136 | "config_width": 1080,
137 | "config_height": 1349
138 | }
139 | ],
140 | "accessibility_caption": "Image may contain: dog",
141 | "is_video": false,
142 | "should_log_client_event": false,
143 | "tracking_token": "eyJ2ZXJzaW9uIjo1LCJwYXlsb2FkIjp7ImlzX2FdGGelBj5c190cmFjJa2VkIjpmYWxzZSwidXVpZCI6IjRlODVlYjAyYzdmYjRmMmViNWYwNzg1ODZlZjRhZTEwMTk1MzU2NDE4NDYzNTI2MzAwMCJ9LCJzaWduYXR1cmUiOiIifQ==",
144 | "edge_media_to_tagged_user": {
145 | "edges": []
146 | },
147 | "edge_media_to_caption": {
148 | "edges": [
149 | {
150 | "node": {
151 | "text": "Cool pic #dogs 👌🏻"
152 | }
153 | }
154 | ]
155 | },
156 | "caption_is_edited": false,
157 | "has_ranked_comments": false,
158 | "edge_media_to_comment": {
159 | "count": 0,
160 | "page_info": {
161 | "has_next_page": false,
162 | "end_cursor": null
163 | },
164 | "edges": []
165 | },
166 | "comments_disabled": false,
167 | "taken_at_timestamp": 1547103020,
168 | "edge_media_preview_like": {
169 | "count": 3,
170 | "edges": []
171 | },
172 | "edge_media_to_sponsor_user": {
173 | "edges": []
174 | },
175 | "location": null,
176 | "viewer_has_liked": false,
177 | "viewer_has_saved": false,
178 | "viewer_has_saved_to_collection": false,
179 | "viewer_in_photo_of_you": false,
180 | "viewer_can_reshare": true,
181 | "owner": {
182 | "id": "7050323018",
183 | "is_verified": false,
184 | "profile_pic_url": "https://instagram.fbne3-1.fna.fbcdn.net/vp/0859933bacb7ef085efcd513c7336f21/5CCBC50C/t51.2885-19/s150x150/47446882_612896971943840_3814256767933636272_n.jpg?_nc_ht=instagram.fbne3-1.fna.fbcdn.net",
185 | "username": "user.name",
186 | "blocked_by_viewer": false,
187 | "followed_by_viewer": false,
188 | "full_name": "Full name",
189 | "has_blocked_viewer": false,
190 | "is_private": false,
191 | "is_unpublished": false,
192 | "requested_by_viewer": false
193 | },
194 | "is_ad": false,
195 | "edge_web_media_to_related_media": {
196 | "edges": []
197 | }
198 | }
199 | }
200 | ]
201 |
202 | ```
203 |
204 | ## Media
205 | To download media as well as scrape metadata, include the `-d` flag. By default, Instamancer downloads the highest-quality image available for each post.
206 |
207 | By enabling full mode with `--full`, all images in albums are downloaded as well.
208 |
209 | Videos are downloaded when the `--video` flag is used along with `--full`.
210 |
211 | The default download location for media is `downloads/[endpoint]/[id]`. This can be changed with the `--downdir` flag.
212 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | See the command-line interface in action [here](https://scriptsmith.github.io/instamancer), and instructions and examples [here](../README.md#command-line)
4 |
5 | |Name|Description|
6 | |---------------------------------------|----------------------------------------------------------|
7 | |[Express server](server.ts) |Express server acting as an API endpoint |
8 | |[Page complexity plugin](complexity.ts)|Plugin that outputs the number of DOM elements on the page|
9 |
10 | Please suggest more examples with a pull request
11 |
--------------------------------------------------------------------------------
/examples/complexity.ts:
--------------------------------------------------------------------------------
1 | import * as instamancer from "instamancer";
2 | import {Response} from "puppeteer";
3 |
4 | class Complexity implements instamancer.IPlugin { // Example plugin: on each responseEvent, prints how many page elements match a selector.
5 | private query: string; // Selector string handed to document.querySelectorAll.
6 |
7 | constructor(query: string) {
8 | this.query = query;
9 | }
10 |
11 | public async responseEvent( // Plugin hook: counts the elements matching the selector and writes the count to stdout.
12 | this: instamancer.IPluginContext, PostType>, // NOTE(review): the type arguments on this line look truncated — confirm against the original file.
13 | res: Response, // Not used in this body.
14 | data: {[key: string]: any}, // Not used in this body.
15 | ): Promise {
16 | const elementCount = await this.state.page.evaluate((query) => { // The callback is evaluated via the page; `query` is the extra argument passed below.
17 | return document.querySelectorAll(query).length;
18 | }, this.plugin.query); // The plugin instance is reached through this.plugin because `this` is the plugin context (see the this-parameter).
19 | process.stdout.write(
20 | `${this.plugin.query} elements: ${elementCount}\n`,
21 | );
22 | }
23 | }
24 |
25 | const user = instamancer.createApi("user", "therock", { // Creates a "user" API for "therock" with the options below.
26 | enableGrafting: false,
27 | plugins: [ // One Complexity plugin per selector, so div, span and img counts are each printed.
28 | new Complexity("div"),
29 | new Complexity("span"),
30 | new Complexity("img"),
31 | ],
32 | silent: true,
33 | total: 500,
34 | });
35 |
36 | (async () => { // Collects every post yielded by user.generator(), then prints how many were collected.
37 | const posts: instamancer.TPost[] = [];
38 | for await (const post of user.generator()) {
39 | posts.push(post);
40 | }
41 |
42 | process.stdout.write(`Total posts ${posts.length}`);
43 | })(); // NOTE(review): the returned promise is not awaited or caught, so a rejection here is unhandled.
44 |
--------------------------------------------------------------------------------
/examples/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "instamancer-examples",
3 | "version": "1.0.0",
4 | "description": "Examples of instamancer usage",
5 | "author": "ScriptSmith",
6 | "license": "MIT-0",
7 | "dependencies": {
8 | "express": "^4.17.1",
9 | "instamancer": "file:..",
10 | "puppeteer": "^1.20.0"
11 | },
12 | "devDependencies": {
13 | "@types/puppeteer": "^1.20.2"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/examples/server.ts:
--------------------------------------------------------------------------------
1 | import express from "express";
2 | import * as instamancer from "instamancer";
3 |
4 | const app = express();
5 | const port = 3000;
6 |
7 | async function getPosts(tag: string): Promise { // Collects the posts yielded for hashtag `tag` into an array. NOTE(review): Promise's type argument looks truncated — confirm.
8 | const hashtag = instamancer.createApi("hashtag", tag, { // A new "hashtag" API object is created on every call.
9 | total: 5,
10 | });
11 | const posts = [];
12 |
13 | for await (const post of hashtag.generator()) { // Consumes the async generator until it finishes.
14 | posts.push(post);
15 | }
16 |
17 | return posts;
18 | }
19 |
20 | let cachedPosts: instamancer.TPost[] = []; // Served by the /cached route; stays empty until getCached() completes.
21 |
22 | async function getCached() { // Replaces cachedPosts with the result of getPosts("puppies").
23 | cachedPosts = await getPosts("puppies");
24 | }
25 | setTimeout(getCached, 3000); // Scheduled once, 3000 ms after startup; nothing in this file refreshes the cache afterwards.
26 |
27 | app.get("/cached", async (req, res) => { // Responds with whatever cachedPosts currently holds.
28 | res.json(cachedPosts);
29 | });
30 |
31 | // Scrapes fresh posts for the hashtag given as ?tag=<name>; replies 400 when the tag is missing.
32 | app.get("/live", async (req, res) => {
33 |     const tag = req.query.tag; // "/live" declares no route params, so req.params never contains "tag".
34 |     if (typeof tag !== "string" || tag === "") { res.status(400).json({error: "missing tag"}); return; }
35 |     res.json(await getPosts(tag));
36 | });
37 |
38 | app.listen(port, () => // Starts listening on `port`; the callback writes a confirmation line to stdout.
39 | process.stdout.write(`Example app listening on port ${port}!\n`),
40 | );
41 |
--------------------------------------------------------------------------------
/examples/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | /* Basic Options */
4 | // "incremental": true, /* Enable incremental compilation */
5 | "target": "es5" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019' or 'ESNEXT'. */,
6 | "module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */,
7 | // "lib": [], /* Specify library files to be included in the compilation. */
8 | // "allowJs": true, /* Allow javascript files to be compiled. */
9 | // "checkJs": true, /* Report errors in .js files. */
10 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
11 | // "declaration": true, /* Generates corresponding '.d.ts' file. */
12 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
13 | // "sourceMap": true, /* Generates corresponding '.map' file. */
14 | // "outFile": "./", /* Concatenate and emit output to single file. */
15 | // "outDir": "./", /* Redirect output structure to the directory. */
16 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */
17 | // "composite": true, /* Enable project compilation */
18 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
19 | // "removeComments": true, /* Do not emit comments to output. */
20 | // "noEmit": true, /* Do not emit outputs. */
21 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */
22 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
23 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */
24 |
25 | /* Strict Type-Checking Options */
26 | "strict": true /* Enable all strict type-checking options. */,
27 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
28 | // "strictNullChecks": true, /* Enable strict null checks. */
29 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */
30 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
31 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
32 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
33 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */
34 |
35 | /* Additional Checks */
36 | // "noUnusedLocals": true, /* Report errors on unused locals. */
37 | // "noUnusedParameters": true, /* Report errors on unused parameters. */
38 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
39 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */
40 |
41 | /* Module Resolution Options */
42 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
43 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */
44 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
45 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
46 | // "typeRoots": [], /* List of folders to include type definitions from. */
47 | // "types": [], /* Type declaration files to be included in compilation. */
48 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
49 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */
50 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
51 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
52 |
53 | /* Source Map Options */
54 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
55 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
56 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
57 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */
58 |
59 | /* Experimental Options */
60 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
61 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/examples/tslint.json:
--------------------------------------------------------------------------------
1 | {
2 | "defaultSeverity": "error",
3 | "extends": ["tslint:recommended"],
4 | "jsRules": {},
5 | "rules": {},
6 | "rulesDirectory": []
7 | }
8 |
--------------------------------------------------------------------------------
/index.ts:
--------------------------------------------------------------------------------
1 | import {createApi} from "./src/api/api";
2 |
3 | export {
4 | Hashtag,
5 | Post,
6 | User,
7 | IOptions,
8 | createApi,
9 | IOptionsCommon,
10 | IOptionsFullApi,
11 | IOptionsRegular,
12 | } from "./src/api/api";
13 | export {Instagram} from "./src/api/instagram";
14 | export {TSearchResult, ISearchOptions} from "./src/api/search";
15 | export {TPost, TSinglePost, TFullApiPost} from "./src/api/types";
16 |
17 | export * from "./plugins";
18 |
--------------------------------------------------------------------------------
/man/instamancer.1:
--------------------------------------------------------------------------------
1 | .\" Manpage for instamancer.
2 | .TH Instamancer 1
3 | .SH NAME
4 | instamancer \- Scrape Instagram's API with Puppeteer
5 | .SH SYNOPSIS
6 | .B instamancer
7 | [\fIoptions\fR]
8 | .IR command
9 | .IR query
10 | .SH DESCRIPTION
11 | Instamancer is an Instagram scraper that uses Puppeteer to control a chromium / chrome browser instance and intercept requests made to APIs.
12 |
13 | Instamancer scrapes hashtags, users, search results, and individual posts.
14 |
15 | Both data and media can be scraped, and then saved to disk or uploaded to external object storage.
16 |
17 | The plugin system can be used to extend instamancer and add other functionality.
18 | .SH OPTIONS
19 | .TP
20 | .BR \-h ", " \-\-help
21 | Show the list of options and examples
22 | .SH SEE ALSO
23 | The Instamancer project and further documentation can be accessed at https://github.com/ScriptSmith/instamancer
24 | .SH BUGS
25 | Please report bugs at
26 | https://github.com/ScriptSmith/instamancer/issues
27 | .SH AUTHOR
28 | Adam Smith https://github.com/ScriptSmith
29 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "instamancer",
3 | "version": "3.3.1",
4 | "description": "Scrape the Instagram API with Puppeteer",
5 | "main": "index.js",
6 | "types": "index.d.ts",
7 | "bin": {
8 | "instamancer": "src/cli.js"
9 | },
10 | "man": [
11 | "./man/instamancer.1"
12 | ],
13 | "files": [
14 | "index.js",
15 | "index.d.ts",
16 | "src/**/*.js",
17 | "src/**/*.d.ts",
18 | "plugins/*.js",
19 | "plugins/*.d.ts",
20 | "plugins/**/*.js",
21 | "plugins/**/*.d.ts"
22 | ],
23 | "scripts": {
24 | "build": "tsc",
25 | "prepack": "tsc --declaration",
26 | "test": "jest --env=node",
27 | "test:ci": "jest --forceExit --env=node",
28 | "lint": "tslint -p tsconfig.json -p tests/tsconfig.json",
29 | "lint:fix": "npm run lint -- --fix",
30 | "prettier": "prettier --write \"{src,tests}/**/*.ts\"",
31 | "clean": "rimraf src/**/*{.js,.d.ts} src/*{.js,.d.ts} plugins/**/*{.js,.d.ts} plugins/*{.js,.d.ts} tests/**/*{.js,.d.ts} tests/*{.js,.d.ts} examples/*{.js,.d.ts} index{.js,.d.ts} *.log"
32 | },
33 | "author": "ScriptSmith",
34 | "license": "MIT",
35 | "keywords": [
36 | "instagram",
37 | "instagram api",
38 | "data mining",
39 | "scraping"
40 | ],
41 | "dependencies": {
42 | "await-lock": "^2.0.1",
43 | "aws-sdk": "^2.715.0",
44 | "axios": "^0.19.2",
45 | "chalk": "^4.1.0",
46 | "env-paths": "^2.2.0",
47 | "fp-ts": "^2.7.0",
48 | "io-ts": "^2.2.9",
49 | "io-ts-excess": "^1.0.1",
50 | "json2csv": "^5.0.1",
51 | "lodash": "^4.17.19",
52 | "puppeteer": "^5.2.0",
53 | "tmp": "^0.2.1",
54 | "uuid": "^8.2.0",
55 | "winston": "^3.3.3",
56 | "yargs": "^15.4.1"
57 | },
58 | "engines": {
59 | "node": ">=10.15.0"
60 | },
61 | "repository": {
62 | "type": "git",
63 | "url": "git@github.com:ScriptSmith/instamancer.git"
64 | },
65 | "devDependencies": {
66 | "@types/aws-sdk": "^2.7.0",
67 | "@types/concat-stream": "^1.6.0",
68 | "@types/express": "^4.17.7",
69 | "@types/jest": "^26.0.4",
70 | "@types/json2csv": "^5.0.1",
71 | "@types/node": "^14.0.23",
72 | "@types/tmp": "^0.2.0",
73 | "@types/uuid": "^8.0.0",
74 | "@types/yargs": "^15.0.5",
75 | "express": "^4.17.1",
76 | "husky": "^4.2.5",
77 | "jest": "^26.1.0",
78 | "lint-staged": "^10.2.11",
79 | "prettier": "^2.0.5",
80 | "rimraf": "^3.0.2",
81 | "transform-json-types": "^0.7.0",
82 | "ts-jest": "^26.1.3",
83 | "tslint": "^6.1.2",
84 | "typescript": "^3.9.7"
85 | },
86 | "jest": {
87 | "coverageDirectory": "./coverage/",
88 | "collectCoverage": true,
89 | "preset": "ts-jest",
90 | "transform": {
91 | "^.+\\.(ts|tsx)$": "ts-jest"
92 | }
93 | },
94 | "husky": {
95 | "hooks": {
96 | "pre-commit": "lint-staged && npm run lint"
97 | }
98 | },
99 | "prettier": {
100 | "trailingComma": "all",
101 | "arrowParens": "always",
102 | "bracketSpacing": false,
103 | "tabWidth": 4
104 | },
105 | "lint-staged": {
106 | "*.json": [
107 | "prettier --write",
108 | "git add"
109 | ],
110 | "*.ts": [
111 | "prettier --write",
112 | "tslint --fix",
113 | "git add"
114 | ]
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/plugins/README.md:
--------------------------------------------------------------------------------
1 | # Plugins
2 |
3 | Plugins allow you to modify instamancer's functionality and behavior while gathering data.
4 |
5 | The following internal plugins are included with instamancer (but not enabled by default):
6 |
7 | |Plugin |Description |
8 | |----------|-------------------------------------------------------------------|
9 | |LargeFirst|Increase the `first` parameter in API requests to ask for more data|
10 |
11 | ## Using plugins with the CLI
12 |
13 | Example:
14 |
15 | ```
16 | instamancer hashtag puppies -c1000 --plugin LargeFirst --plugin MyPlugin
17 | ```
18 |
19 | ## Using external plugins with the CLI
20 |
21 | To install external plugins, you need to clone and install instamancer from source
22 |
23 | Steps:
24 |
25 | 1. Clone the instamancer repository
26 | 2. Install instamancer's dependencies
27 | 3. Install the plugin with npm / yarn
28 | 4. Add the plugin to `plugins/plugins/index.ts`
29 |
30 | Example:
31 |
32 |
33 | ``` typescript
34 | export { MyPlugin } from "myplugin";
35 | ```
36 |
37 | 5. Install instamancer
38 | 1. You can skip this step if you want to run the CLI from source
39 | 6. Run the CLI with the plugin:
40 |
41 |
42 | Example:
43 |
44 |
45 | ```
46 | instamancer hashtag puppies -c100 --plugin MyPlugin
47 | ```
48 |
49 | ## Using plugins with the module
50 |
51 | Add the plugin to the `options`:
52 |
53 | ``` typescript
54 | import * as instamancer from ".";
55 |
56 | const options: instamancer.IOptions = {
57 | plugins: [new instamancer.plugins.LargeFirst()],
58 | silent: true,
59 | total: 100,
60 | };
61 | const hashtag = instamancer.createApi("hashtag", "puppies", options);
62 |
63 | (async () => {
64 | for await (const post of hashtag.generator()) {
65 | console.log(post);
66 | }
67 | })();
68 |
69 | ```
70 |
--------------------------------------------------------------------------------
/plugins/index.ts:
--------------------------------------------------------------------------------
1 | import * as allPlugins from "./plugins";
2 |
3 | export const plugins = allPlugins;
4 | export * from "./plugin";
5 |
--------------------------------------------------------------------------------
/plugins/plugin.ts:
--------------------------------------------------------------------------------
1 | import * as puppeteer from "puppeteer";
2 | import {Instagram, TFullApiPost, TPost, TSearchResult, TSinglePost} from "..";
3 |
4 | export type DType = TPost | TSinglePost | TFullApiPost | TSearchResult;
5 |
6 | /**
7 |  * The `this` value that plugin event handlers are invoked with.
8 |  *
9 |  * NOTE(review): the type parameters here were reconstructed from the
10 |  * remaining `plugin: Plugin` member and the dangling `, PostType>`
11 |  * fragments; confirm against the upstream file.
12 |  */
13 | export interface IPluginContext<Plugin, PostType> {
14 |     // The plugin the handler belongs to (presumably the instance passed in `options.plugins`; confirm)
15 |     plugin: Plugin;
16 |     // The API wrapper the plugin is attached to
17 |     state: Instagram<PostType>;
18 | }
19 |
20 | /**
21 |  * Hooks a plugin may implement. Every hook is optional.
22 |  */
23 | export interface IPlugin<PostType> {
24 |     /**
25 |      * Synchronous hook for the `construction` event, which the wrapper
26 |      * fires at the end of its constructor.
27 |      */
28 |     constructionEvent?(this: IPluginContext<IPlugin<PostType>, PostType>): void;
29 |
30 |     /**
31 |      * Hook for the `request` event, fired for an intercepted API request
32 |      * before it is continued. `overrides` is the object later handed to
33 |      * `req.continue`, so mutating it changes the outgoing request.
34 |      */
35 |     requestEvent?(
36 |         this: IPluginContext<IPlugin<PostType>, PostType>,
37 |         req: puppeteer.Request,
38 |         overrides: puppeteer.Overrides,
39 |     ): Promise<void>;
40 |
41 |     /**
42 |      * Hook for the `response` event, fired with an API response and its
43 |      * parsed JSON body.
44 |      */
45 |     responseEvent?(
46 |         this: IPluginContext<IPlugin<PostType>, PostType>,
47 |         res: puppeteer.Response,
48 |         data: {[key: string]: any},
49 |     ): Promise<void>;
50 |
51 |     /**
52 |      * Hook for the `postPage` event, fired with the data parsed from a
53 |      * post page before it is added to the post buffer.
54 |      */
55 |     postPageEvent?(
56 |         this: IPluginContext<IPlugin<PostType>, PostType>,
57 |         data: PostType,
58 |     ): Promise<void>;
59 |
60 |     /**
61 |      * Hook for the `grafting` event.
62 |      */
63 |     graftingEvent?(
64 |         this: IPluginContext<IPlugin<PostType>, PostType>,
65 |     ): Promise<void>;
66 | }
35 |
36 | export enum AsyncPluginEvents {
37 | browser,
38 | grafting,
39 | postPage,
40 | request,
41 | response,
42 | }
43 |
44 | export type AsyncPluginEventsType = keyof typeof AsyncPluginEvents;
45 |
46 | export enum SyncPluginEvents {
47 | construction,
48 | }
49 |
50 | export type SyncPluginEventsType = keyof typeof SyncPluginEvents;
51 |
52 | export type PluginEventsType = SyncPluginEventsType | AsyncPluginEventsType;
53 |
--------------------------------------------------------------------------------
/plugins/plugins/index.ts:
--------------------------------------------------------------------------------
1 | export {LargeFirst} from "./largeFirst";
2 |
3 | // Add your own plugins here
4 |
--------------------------------------------------------------------------------
/plugins/plugins/largeFirst.ts:
--------------------------------------------------------------------------------
1 | import {Overrides, Request} from "puppeteer";
2 | import * as querystring from "querystring";
3 | import {format as urlFormat, parse as urlParse} from "url";
4 | import {IPlugin, IPluginContext} from "../plugin";
5 |
6 | /**
7 |  * Plugin that asks the API for more posts per request.
8 |  *
9 |  * NOTE(review): the type arguments were reconstructed from the dangling
10 |  * `, PostType>` fragment; confirm against the upstream file.
11 |  */
12 | export class LargeFirst<PostType> implements IPlugin<PostType> {
13 |     /**
14 |      * Raise the wrapper's jump size when the plugin is attached.
15 |      */
16 |     public constructionEvent(
17 |         this: IPluginContext<LargeFirst<PostType>, PostType>,
18 |     ): void {
19 |         this.state.jumpSize = 150;
20 |     }
21 |
22 |     /**
23 |      * Rewrite the outgoing request URL so that `variables.first` is 50.
24 |      *
25 |      * The URL is taken from `overrides.url` when already set, otherwise
26 |      * from the request itself; the result is written to `overrides.url`.
27 |      * Requests without a `variables` query parameter are left untouched.
28 |      *
29 |      * @param req the intercepted request
30 |      * @param overrides overrides that will be applied to the request
31 |      */
32 |     public async requestEvent(req: Request, overrides: Overrides) {
33 |         const url = overrides["url"] ? overrides["url"] : req.url();
34 |         const parsedUrl = urlParse(url);
35 |         const query = querystring.parse(parsedUrl.query);
36 |
37 |         // JSON.parse(undefined) throws, so skip requests that carry no
38 |         // single `variables` value instead of failing the whole request
39 |         const rawVariables = query["variables"];
40 |         if (typeof rawVariables !== "string") {
41 |             return;
42 |         }
43 |         const variables = JSON.parse(rawVariables);
44 |
45 |         variables.first = 50;
46 |
47 |         query.variables = JSON.stringify(variables);
48 |         parsedUrl.search = "?" + querystring.stringify(query);
49 |         overrides["url"] = urlFormat(parsedUrl);
50 |     }
51 | }
26 |
--------------------------------------------------------------------------------
/src/api/api.ts:
--------------------------------------------------------------------------------
1 | import {Type} from "io-ts";
2 | import {Browser} from "puppeteer";
3 | import * as winston from "winston";
4 | import {DType, IPlugin} from "../../plugins";
5 | import {Instagram} from "./instagram";
6 | import {
7 | ISearchOptions,
8 | ISearchOptionsPlugins,
9 | Search,
10 | TSearchResult,
11 | } from "./search";
12 | import {
13 | FullApiPost,
14 | Post as PostValidator,
15 | SinglePost,
16 | TFullApiPost,
17 | TPost,
18 | TSinglePost,
19 | } from "./types";
20 |
21 | /**
22 | * Optional arguments for the API. Defaults mentioned below are the ones applied by Instagram.defaultOptions
23 | */
24 | export interface IOptionsCommon {
25 | // Total posts to download. 0 for unlimited (default: 0)
26 | total?: number;
27 |
28 | // Run Chrome in headless mode (default: true)
29 | headless?: boolean;
30 |
31 | // Logger that receives events (default: a silent winston logger)
32 | logger?: winston.Logger;
33 |
34 | // Run without output to stdout (default: true)
35 | silent?: boolean;
36 |
37 | // Time to sleep between interactions with the page, in seconds (default: 2)
38 | sleepTime?: number;
39 |
40 | // Throw an error if type validation fails
41 | strict?: boolean;
42 |
43 | // Time to sleep when rate-limited, in seconds (default: 20 minutes)
44 | hibernationTime?: number;
45 |
46 | // Enable the grafting process (default: true)
47 | enableGrafting?: boolean;
48 |
49 | // Use the same browser instance when grafting (default: false)
50 | sameBrowser?: boolean;
51 |
52 | // Extract the full amount of information from the API (default: false)
53 | fullAPI?: boolean;
54 |
55 | // Use a proxy in Chrome to connect to Instagram
56 | proxyURL?: string;
57 |
58 | // Location of the chromium / chrome binary executable
59 | executablePath?: string;
60 |
61 | // Custom io-ts validator, used instead of the built-in one when provided
62 | validator?: Type;
63 |
64 | // Use a puppeteer Browser instance created by the caller.
65 | // The caller is responsible for closing it once it is no longer needed.
66 | browserInstance?: Browser;
67 | }
68 |
69 | export interface IOptionsFullApi extends IOptionsCommon {
70 | fullAPI: true;
71 | }
72 |
73 | export interface IOptionsRegular extends IOptionsCommon {
74 | fullAPI?: false;
75 | }
76 |
77 | export interface IOptionsFullApiPlugins extends IOptionsFullApi {
78 | plugins?: IPlugin[];
79 | }
80 |
81 | export interface IOptionsRegularPlugins extends IOptionsRegular {
82 | plugins?: IPlugin[];
83 | }
84 |
85 | export type IOptions =
86 | | IOptionsFullApi
87 | | IOptionsRegular
88 | | IOptionsFullApiPlugins
89 | | IOptionsRegularPlugins;
90 |
91 | /**
92 |  * An Instagram post API wrapper
93 |  *
94 |  * Visits the page of each given post id in turn and collects its metadata.
95 |  */
96 | export class Post extends Instagram {
97 |     // Post ids
98 |     private readonly ids: string[];
99 |
100 |     /**
101 |      * @param ids shortcodes of the posts to visit
102 |      * @param options configuration details; `fullAPI` is ignored
103 |      */
104 |     constructor(ids: string[], options: IOptions = {}) {
105 |         // fullAPI option makes no sense for Post class
106 |         // But usage with fullAPI option brings an extra post, because of scrapeDefaultPosts
107 |         // So we force it to be disabled, on a copy, so that an options
108 |         // object shared with other wrappers keeps the caller's fullAPI value
109 |         super(
110 |             "https://instagram.com/p/[id]",
111 |             ids[0],
112 |             "",
113 |             "",
114 |             {...options, fullAPI: false},
115 |             SinglePost,
116 |         );
117 |         this.ids = ids;
118 |     }
119 |
120 |     /**
121 |      * Get the post metadata
122 |      *
123 |      * Opens every post page (up to 5 retries each), sleeping 2 seconds
124 |      * between posts, then marks the wrapper as finished.
125 |      */
126 |     protected async getNext() {
127 |         for (const id of this.ids) {
128 |             this.id = id;
129 |             await this.postPage(id, 5);
130 |             await this.sleep(2);
131 |         }
132 |         this.finished = true;
133 |     }
134 | }
126 |
127 | const getPageValidator = (options: IOptions) =>
128 | options.fullAPI ? FullApiPost : PostValidator;
129 |
130 | export type InstagramPostClass = Hashtag | User;
131 | export type InstagramFullPostClass = Hashtag | User;
132 |
133 | /**
134 |  * Create an API wrapper for the given resource type.
135 |  *
136 |  * @param type kind of resource to scrape
137 |  * @param id search query, hashtag / user name, or list of post ids
138 |  * @param options configuration details; may be omitted
139 |  * @returns a `Search`, `Post`, `Hashtag` or `User` instance matching `type`
140 |  * @throws Error if `type` is not one of the supported values
141 |  */
142 | export function createApi(
143 |     type: "search",
144 |     query: string,
145 |     options?: ISearchOptions | ISearchOptionsPlugins,
146 | ): Search;
147 | export function createApi(type: "post", id: string[], options?: IOptions): Post;
148 | export function createApi(
149 |     type: "hashtag" | "user",
150 |     id: string,
151 |     options?: IOptionsRegular | IOptionsRegularPlugins,
152 | ): InstagramPostClass;
153 | export function createApi(
154 |     type: "hashtag" | "user",
155 |     id: string,
156 |     options?: IOptionsFullApi | IOptionsFullApiPlugins,
157 | ): InstagramFullPostClass;
158 |
159 | export function createApi(
160 |     type: "hashtag" | "user" | "post" | "search",
161 |     id: string | string[],
162 |     options?: IOptions,
163 | ): Post | InstagramPostClass | InstagramFullPostClass | Search {
164 |     let ClassConstructor: typeof Hashtag | typeof User;
165 |     switch (type) {
166 |         case "search":
167 |             return new Search(id as string, options as ISearchOptions);
168 |         case "post":
169 |             return new Post(id as string[], options);
170 |         case "hashtag":
171 |             ClassConstructor = Hashtag;
172 |             break;
173 |         case "user":
174 |             ClassConstructor = User;
175 |             break;
176 |         default:
177 |             // Reachable from untyped callers; fail with a clear message
178 |             throw new Error("Unknown API type: " + type);
179 |     }
180 |     // `options` is optional, so it must not be dereferenced unguarded.
181 |     // NOTE(review): both branches are identical as written here; presumably
182 |     // they once differed by a type argument. Confirm before merging them.
183 |     if (options && options.fullAPI) {
184 |         return new ClassConstructor(id as string, options);
185 |     }
186 |     return new ClassConstructor(id as string, options);
187 | }
173 |
174 | /**
175 | * An Instagram hashtag API wrapper
176 | */
177 | export class Hashtag extends Instagram {
178 | constructor(id: string, options: IOptions = {}) {
179 | super(
180 | "https://instagram.com/explore/tags/[id]",
181 | id,
182 | "data.hashtag.edge_hashtag_to_media.page_info",
183 | "data.hashtag.edge_hashtag_to_media.edges",
184 | options,
185 | getPageValidator(options),
186 | );
187 | }
188 | }
189 |
190 | /**
191 | * An Instagram user API wrapper
192 | */
193 | export class User extends Instagram {
194 | defaultPageFunctions = [
195 | /* istanbul ignore next */
196 | () => {
197 | let morePostsIntervalCounter = 0;
198 | const morePostsInterval = setInterval(() => {
199 | const searchDiv = Array.from(
200 | document.getElementsByTagName("div"),
201 | ).filter((d) =>
202 | d.innerHTML.startsWith("Show More Posts from"),
203 | )[0];
204 |
205 | morePostsIntervalCounter++;
206 |
207 | if (searchDiv !== undefined) {
208 | searchDiv.parentElement.parentElement.click();
209 | clearInterval(morePostsInterval);
210 | } else if (morePostsIntervalCounter > 10) {
211 | clearInterval(morePostsInterval);
212 | }
213 | }, 1000);
214 | },
215 | ];
216 |
217 | constructor(id: string, options: IOptions = {}) {
218 | super(
219 | "https://instagram.com/[id]",
220 | id,
221 | "data.user.edge_owner_to_timeline_media.page_info",
222 | "data.user.edge_owner_to_timeline_media.edges",
223 | options,
224 | getPageValidator(options),
225 | );
226 | }
227 | }
228 |
--------------------------------------------------------------------------------
/src/api/instagram.ts:
--------------------------------------------------------------------------------
1 | import AwaitLock from "await-lock";
2 | import chalk from "chalk";
3 | import {isLeft} from "fp-ts/lib/Either";
4 | import {Type} from "io-ts";
5 | import {PathReporter} from "io-ts/lib/PathReporter";
6 | import {ThrowReporter} from "io-ts/lib/ThrowReporter";
7 | import * as _ from "lodash/object";
8 | import {
9 | Browser,
10 | Headers,
11 | launch,
12 | LaunchOptions,
13 | Page,
14 | Request,
15 | Response,
16 | } from "puppeteer";
17 | import * as winston from "winston";
18 | import {
19 | AsyncPluginEventsType,
20 | IPlugin,
21 | IPluginContext,
22 | PluginEventsType,
23 | SyncPluginEvents,
24 | SyncPluginEventsType,
25 | } from "../../plugins";
26 | import {IOptions} from "./api";
27 | import {PostIdSet} from "./postIdSet";
28 |
29 | type AsyncPluginFunctions = {
30 | [key in AsyncPluginEventsType]: ((...args: any[]) => Promise)[];
31 | };
32 | type SyncPluginFunctions = {
33 | [key in SyncPluginEventsType]: ((...args: any[]) => void)[];
34 | };
35 | type PluginFunctions = AsyncPluginFunctions & SyncPluginFunctions;
36 |
37 | /**
38 | * Instagram API wrapper
39 | */
40 | export class Instagram {
41 | /**
42 | * Apply defaults to undefined options
43 | */
44 | private static defaultOptions(options: IOptions) {
45 | if (options.enableGrafting === undefined) {
46 | options.enableGrafting = true;
47 | }
48 | if (options.sameBrowser === undefined) {
49 | options.sameBrowser = false;
50 | }
51 | if (options.fullAPI === undefined) {
52 | options.fullAPI = false;
53 | }
54 | if (options.headless === undefined) {
55 | options.headless = true;
56 | }
57 | if (options.logger === undefined) {
58 | options.logger = winston.createLogger({
59 | silent: true,
60 | });
61 | }
62 | if (options.silent === undefined) {
63 | options.silent = true;
64 | }
65 | if (options.sleepTime === undefined) {
66 | options.sleepTime = 2;
67 | }
68 | if (options.hibernationTime === undefined) {
69 | options.hibernationTime = 60 * 20;
70 | }
71 | if (options.total === undefined) {
72 | options.total = 0;
73 | }
74 | return options;
75 | }
76 |
77 | // Resource identifier
78 | public id: string;
79 | public url: string;
80 |
81 | // Iteration state
82 | public started: boolean = false;
83 | public paused: boolean = false;
84 | public finished: boolean = false;
85 | public finishedReason: FinishedReasons;
86 |
87 | // Instagram URLs
88 | public catchURL: string = "https://www.instagram.com/graphql/query";
89 | public postURL: string = "https://www.instagram.com/p/";
90 | public defaultPostURL: string = "https://www.instagram.com/p/";
91 |
92 | // Number of jumps before grafting
93 | public jumpMod: number = 100;
94 |
95 | // Depth of jumps
96 | public jumpSize: number = 2;
97 |
98 | // Puppeteer resources
99 | public page: Page;
100 |
101 | // Logging object
102 | public logger: winston.Logger;
103 |
104 | // Implementation-specific page functions
105 | public defaultPageFunctions: (() => void)[] = [];
106 |
107 | // Validations
108 | private readonly strict: boolean = false;
109 | private readonly validator: Type;
110 |
111 | // Puppeteer state
112 | private browser: Browser;
113 | private browserDisconnected: boolean = true;
114 | private readonly browserInstance?: Browser;
115 | private readonly headless: boolean;
116 |
117 | // Array of scraped posts and lock
118 | private postBuffer: PostType[] = [];
119 | private postBufferLock: AwaitLock = new AwaitLock();
120 |
121 | // Request and Response buffers and locks
122 | private requestBuffer: Request[] = [];
123 | private requestBufferLock: AwaitLock = new AwaitLock();
124 | private responseBuffer: Response[] = [];
125 | private responseBufferLock: AwaitLock = new AwaitLock();
126 |
127 | // Get full amount of data from API
128 | private readonly fullAPI: boolean = false;
129 | private pagePromises: Promise[] = [];
130 |
131 | // Grafting state
132 | private readonly enableGrafting: boolean = true;
133 | private readonly sameBrowser: boolean = false;
134 | private graft: boolean = false;
135 | private graftURL: string = null;
136 | private graftHeaders: Headers = null;
137 | private foundGraft: boolean = false;
138 |
139 | // Hibernation due to rate limiting
140 | private hibernate: boolean = false;
141 | private readonly hibernationTime: number = 60 * 20; // 20 minutes
142 |
143 | // Number of jumps before exiting because lack of data
144 | private failedJumps: number = 20;
145 | private responseFromAPI: boolean = false;
146 |
147 | // Strings denoting the access methods of API objects
148 | private readonly pageQuery: string;
149 | private readonly edgeQuery: string;
150 |
151 | // Cache of post ids
152 | private postIds: PostIdSet;
153 |
154 | // Iteration variables
155 | private readonly total: number;
156 | private index: number = 0;
157 | private jumps: number = 0;
158 |
159 | // Number of times to attempt to visit url initially
160 | private readonly maxPageUrlAttempts = 3;
161 | private pageUrlAttempts = 0;
162 | private postPageRetries = 5;
163 |
164 | // Output
165 | private readonly silent: boolean = false;
166 | private writeLock: AwaitLock = new AwaitLock();
167 |
168 | // Sleep time remaining
169 | private sleepRemaining: number = 0;
170 |
171 | // Length of time to sleep for
172 | private readonly sleepTime: number = 2;
173 |
174 | // Proxy for Instagram connection
175 | private readonly proxyURL: string;
176 |
177 | // Location of chromium / chrome binary executable
178 | private readonly executablePath: string;
179 |
180 | // Plugins to be run
181 | private pluginFunctions: PluginFunctions = {
182 | browser: [],
183 | construction: [],
184 | grafting: [],
185 | postPage: [],
186 | request: [],
187 | response: [],
188 | };
189 |
190 | /**
191 | * Create API wrapper instance
192 | * @param endpoint the url for the type of resource to scrape
193 | * @param id the identifier for the resource
194 | * @param pageQuery the query to identify future pages in the nested API structure
195 | * @param edgeQuery the query to identify posts in the nested API structure
196 | * @param options configuration details
197 | * @param validator response type validator
198 | */
199 | constructor(
200 | endpoint: string,
201 | id: string,
202 | pageQuery: string,
203 | edgeQuery: string,
204 | options: IOptions = {},
205 | validator: Type,
206 | ) {
207 | this.id = id;
208 | this.postIds = new PostIdSet();
209 | this.url = endpoint.replace("[id]", id);
210 |
211 | options = Instagram.defaultOptions(options);
212 | this.total = options.total;
213 | this.pageQuery = pageQuery;
214 | this.edgeQuery = edgeQuery;
215 | this.browserInstance = options.browserInstance;
216 | this.headless = options.headless;
217 | this.logger = options.logger;
218 | this.silent = options.silent;
219 | this.strict = options.strict;
220 | this.enableGrafting = options.enableGrafting;
221 | this.sameBrowser = options.sameBrowser;
222 | this.sleepTime = options.sleepTime;
223 | this.hibernationTime = options.hibernationTime;
224 | this.fullAPI = options.fullAPI;
225 | this.proxyURL = options.proxyURL;
226 | this.executablePath = options.executablePath;
227 | this.validator = options.validator || validator;
228 |
229 | this.addPlugins(options["plugins"]);
230 | this.executePlugins("construction");
231 | }
232 |
233 | /**
234 | * Toggle pausing data collection
235 | */
236 | public pause() {
237 | this.paused = !this.paused;
238 | }
239 |
240 | /**
241 | * Request prolonged pausing (hibernation). Only sets the flag, so despite the name this is not a true toggle
242 | */
243 | public toggleHibernation() {
244 | this.hibernate = true;
245 | }
246 |
247 | /**
248 | * Force the API to stop
249 | */
250 | public async forceStop(force?: boolean) {
251 | if (!force && !this.started) {
252 | return;
253 | }
254 | this.started = false;
255 | this.finish(FinishedReasons.FORCED_STOP);
256 | try {
257 | this.requestBufferLock.release();
258 | // tslint:disable-next-line: no-empty
259 | } catch (e) {}
260 | try {
261 | this.responseBufferLock.release();
262 | // tslint:disable-next-line: no-empty
263 | } catch (e) {}
264 | await this.stop();
265 | }
266 |
267 | /**
268 | * Generator of posts on page
269 | */
270 | public async *generator(): AsyncIterableIterator {
271 | // Start if haven't done so already
272 | if (!this.started) {
273 | await this.start();
274 | }
275 |
276 | while (true) {
277 | // Get more posts
278 | await this.getNext();
279 |
280 | // Yield posts from buffer
281 | let post = await this.postPop();
282 | while (post) {
283 | yield post;
284 | post = await this.postPop();
285 | }
286 |
287 | // End loop when finished, check for pagePromises if fullAPI
288 | if (this.finished && this.pagePromises.length === 0) {
289 | break;
290 | }
291 | }
292 | await this.stop();
293 |
294 | // Add newline to end of output
295 | if (!this.silent) {
296 | process.stdout.write("\n");
297 | }
298 | }
299 |
300 | /**
301 | * Construct page and add listeners
302 | */
303 | public async start() {
304 | let pageConstructed: boolean;
305 | this.pageUrlAttempts = 0;
306 | while (this.pageUrlAttempts++ < this.maxPageUrlAttempts) {
307 | pageConstructed = await this.constructPage();
308 | if (pageConstructed) {
309 | break;
310 | }
311 | }
312 | if (!pageConstructed) {
313 | await this.forceStop(true);
314 | throw new Error("Failed to visit URL");
315 | }
316 |
317 | // Page is built and the url visited; notify plugins of the browser event
318 | await this.executePlugins("browser");
319 |
320 | this.started = true;
321 |
322 | // Add event listeners for requests and responses
323 | await this.page.setRequestInterception(true);
324 | this.page.on("request", (req) => this.interceptRequest(req));
325 | this.page.on("response", (res) => this.interceptResponse(res));
326 | this.page.on("requestfailed", (res) => this.interceptFailure(res));
327 | this.page.on("console", (message) =>
328 | this.logger.info("Console log", {message}),
329 | );
330 |
331 | // Ignore dialog boxes
332 | this.page.on("dialog", (dialog) => dialog.dismiss());
333 |
334 | // Log errors
335 | /* istanbul ignore next */
336 | this.page.on("error", (error) =>
337 | this.logger.error("Console error", {error}),
338 | );
339 |
340 | // Gather initial posts from web page
341 | if (this.fullAPI) {
342 | await this.scrapeDefaultPosts();
343 | }
344 | }
345 |
346 | /**
347 |  * Decide whether a url is one of the API requests this wrapper collects:
348 |  * it must start with the catch URL and must not mention "include_reel"
349 |  */
350 | public matchURL(url: string) {
351 |     if (!url.startsWith(this.catchURL)) {
352 |         return false;
353 |     }
354 |     return !url.includes("include_reel");
355 | }
352 |
353 | /**
354 | * Close the page and browser
355 | */
356 | protected async stop() {
357 | await this.progress(Progress.CLOSING);
358 |
359 | // Remove listeners
360 | if (!this.page.isClosed()) {
361 | this.page.removeAllListeners("request");
362 | this.page.removeAllListeners("response");
363 | this.page.removeAllListeners("requestfailed");
364 | }
365 |
366 | // Clear request buffers
367 | await this.requestBufferLock.acquireAsync();
368 | this.requestBuffer = [];
369 | this.requestBufferLock.release();
370 |
371 | // Clear response buffers
372 | await this.responseBufferLock.acquireAsync();
373 | this.responseBuffer = [];
374 | this.responseBufferLock.release();
375 |
376 | // Wait for pagePromises to empty
377 | while (true) {
378 | if (this.pagePromises.length === 0) {
379 | break;
380 | } else {
381 | /* istanbul ignore next */
382 | await this.sleep(1);
383 | }
384 | }
385 |
386 | // Close page
387 | if (!this.page.isClosed()) {
388 | await this.page.close();
389 | }
390 |
391 | if (!this.browserDisconnected && !this.browserInstance) {
392 | await this.browser.close();
393 | }
394 | }
395 |
396 | /**
397 | * Finish retrieving data for the generator
398 | */
399 | protected finish(reason: FinishedReasons) {
400 | this.finished = true;
401 | this.finishedReason = reason;
402 | this.logger.info("Finished collecting", {reason});
403 | }
404 |
405 | /**
406 |  * Process the requests in the request buffer
407 |  *
408 |  * Requests that match the API url are passed to the `request` plugins and
409 |  * then continued. While grafting, the first matching request is recorded
410 |  * and aborted, and the next one is continued with the recorded url and
411 |  * headers. The buffer is always emptied and the lock always released,
412 |  * even when a plugin or puppeteer call throws.
413 |  */
414 | protected async processRequests() {
415 |     await this.requestBufferLock.acquireAsync();
416 |
417 |     let newApiRequest = false;
418 |     try {
419 |         for (const req of this.requestBuffer) {
420 |             // Match url
421 |             if (!this.matchURL(req.url())) {
422 |                 continue;
423 |             } else {
424 |                 newApiRequest = true;
425 |             }
426 |
427 |             // Begin grafting if required, else continue the request
428 |             if (this.graft) {
429 |                 if (this.foundGraft === false) {
430 |                     // Gather details
431 |                     this.graftURL = req.url();
432 |                     this.graftHeaders = req.headers();
433 |                     this.foundGraft = true;
434 |
435 |                     // Cancel request
436 |                     await req.abort();
437 |                 } else {
438 |                     // Swap request
439 |                     const overrides = {
440 |                         headers: this.graftHeaders,
441 |                         url: this.graftURL,
442 |                     };
443 |                     await this.executePlugins("request", req, overrides);
444 |                     await req.continue(overrides);
445 |
446 |                     // Reset grafting data
447 |                     this.graft = false;
448 |                     this.foundGraft = false;
449 |                     this.graftURL = null;
450 |                     this.graftHeaders = null;
451 |                 }
452 |
453 |                 // Stop reading requests
454 |                 break;
455 |             } else {
456 |                 const overrides = {};
457 |                 // Wait for the plugins, as the grafting branch does, so
458 |                 // their changes to `overrides` land before the request is sent
459 |                 await this.executePlugins("request", req, overrides);
460 |                 await req.continue(overrides);
461 |             }
462 |         }
463 |     } finally {
464 |         // Clear buffer and release, so a failure cannot leave the lock held
465 |         this.requestBuffer = [];
466 |         this.requestBufferLock.release();
467 |     }
468 |
469 |     if (this.foundGraft && newApiRequest) {
470 |         // Restart browser and page, clearing all buffers
471 |         await this.stop();
472 |         await this.start();
473 |     }
474 | }
465 |
466 | /**
467 |  * Process the responses in the response buffer
468 |  *
469 |  * For each response that matches the API url: parse its JSON body, pass it
470 |  * to the `response` plugins, enable hibernation when the API reports a
471 |  * failure, finish when no further page is advertised, and hand the data to
472 |  * processResponseData. The buffer is always emptied and the lock always
473 |  * released, even when a plugin or handler throws.
474 |  */
475 | protected async processResponses() {
476 |     await this.responseBufferLock.acquireAsync();
477 |
478 |     try {
479 |         for (const res of this.responseBuffer) {
480 |             // Match url
481 |             if (!this.matchURL(res.url())) {
482 |                 continue;
483 |             }
484 |
485 |             // Acknowledge receipt of response
486 |             this.responseFromAPI = true;
487 |
488 |             // Get JSON data
489 |             let data: unknown;
490 |             try {
491 |                 data = await res.json();
492 |                 if (typeof data !== "object") {
493 |                     this.logger.error("Response data is not an object", {data});
494 |                     continue;
495 |                 }
496 |             } catch (error) {
497 |                 this.logger.error("Error processing response JSON", {
498 |                     data,
499 |                     error,
500 |                 });
501 |                 continue;
502 |             }
503 |
504 |             // Emit event and wait for the plugins before the data is used
505 |             await this.executePlugins("response", res, data);
506 |
507 |             // Check for rate limiting
508 |             if (data && "status" in data && data["status"] === "fail") {
509 |                 this.logger.info("Rate limited");
510 |                 this.hibernate = true;
511 |                 continue;
512 |             }
513 |
514 |             // Check for next page
515 |             if (
516 |                 !(
517 |                     _.get(data, this.pageQuery + ".has_next_page", false) &&
518 |                     _.get(data, this.pageQuery + ".end_cursor", false)
519 |                 )
520 |             ) {
521 |                 this.logger.info("No posts remaining", {data});
522 |                 this.finish(FinishedReasons.API_FINISHED);
523 |             }
524 |
525 |             await this.processResponseData(data);
526 |         }
527 |     } finally {
528 |         // Clear buffer and release, so a failure cannot leave the lock held
529 |         this.responseBuffer = [];
530 |         this.responseBufferLock.release();
531 |     }
532 | }
525 |
526 | protected async processResponseData(data: unknown) {
527 | // Get posts
528 | const posts = _.get(data, this.edgeQuery, []);
529 | for (const post of posts) {
530 | const postId = post["node"]["id"];
531 |
532 | // Check it hasn't already been cached
533 | const contains = this.postIds.add(postId);
534 | if (contains) {
535 | this.logger.info("Duplicate id found", {postId});
536 | continue;
537 | }
538 |
539 | // Add to postBuffer
540 | if (this.index < this.total || this.total === 0) {
541 | this.index++;
542 | if (this.fullAPI) {
543 | this.pagePromises.push(
544 | this.postPage(
545 | post["node"]["shortcode"],
546 | this.postPageRetries,
547 | ),
548 | );
549 | } else {
550 | await this.addToPostBuffer(post);
551 | }
552 | } else {
553 | this.finish(FinishedReasons.TOTAL_REACHED_API);
554 | break;
555 | }
556 | }
557 | }
558 |
559 | /**
560 | * Open a post in a new page, then extract its metadata
561 | */
562 | protected async postPage(post: string, retries: number) {
563 | // Create page
564 | const postPage = await this.browser.newPage();
565 | await postPage.setRequestInterception(true);
566 | postPage.on("request", async (req) => {
567 | if (!req.url().includes("/p/" + post)) {
568 | await req.abort();
569 | } else {
570 | await req.continue();
571 | }
572 | });
573 | postPage.on("requestfailed", async (req) => this.interceptFailure(req));
574 |
575 | // Visit post and read state
576 | let parsed;
577 | try {
578 | await postPage.goto(this.postURL + post + "/");
579 | } catch (error) {
580 | await this.handlePostPageError(
581 | postPage,
582 | error,
583 | "Couldn't navigate to page",
584 | post,
585 | retries,
586 | );
587 | return;
588 | }
589 |
590 | // Load data from memory
591 | let data;
592 | try {
593 | /* istanbul ignore next */
594 | data = await postPage.evaluate(async () => {
595 | // Wait for _sharedData value to be set
596 | await new Promise((resolve) => {
597 | let i = 0;
598 | const findSharedData = setInterval(() => {
599 | if (window["_sharedData"] !== undefined || i++ > 5) {
600 | resolve();
601 | clearInterval(findSharedData);
602 | }
603 | }, 2000);
604 | });
605 |
606 | return JSON.stringify(
607 | window["_sharedData"].entry_data.PostPage[0].graphql,
608 | );
609 | });
610 | } catch (error) /* istanbul ignore next */ {
611 | await this.handlePostPageError(
612 | postPage,
613 | error,
614 | "Couldn't evaluate on page",
615 | post,
616 | retries,
617 | );
618 | return;
619 | }
620 |
621 | // Close page
622 | await postPage.close();
623 |
624 | // Parse data to PostType
625 | try {
626 | parsed = JSON.parse(data) as PostType;
627 | } catch (error) /* istanbul ignore next */ {
628 | await this.handlePostPageError(
629 | postPage,
630 | error,
631 | "Couldn't parse page data",
632 | post,
633 | retries,
634 | );
635 | return;
636 | }
637 |
638 | await this.executePlugins("postPage", parsed);
639 | await this.addToPostBuffer(parsed);
640 | }
641 |
642 | private async handlePostPageError(
643 | page: Page,
644 | error: Error,
645 | message: string,
646 | post: string,
647 | retries: number,
648 | ) {
649 | // Log error and wait
650 | this.logger.error(message, {error});
651 | await this.progress(Progress.ABORTED);
652 | await this.sleep(2);
653 |
654 | // Close existing attempt
655 | if (!page.isClosed()) {
656 | await page.close();
657 | }
658 |
659 | // Retry
660 | if (retries > 0) {
661 | await this.postPage(post, --retries);
662 | }
663 | }
664 |
665 | protected async validatePost(post: PostType) {
666 | const validationResult = this.validator.decode(post);
667 | if (this.strict) {
668 | ThrowReporter.report(validationResult);
669 | return;
670 | }
671 | if (isLeft(validationResult)) {
672 | const validationReporter = PathReporter.report(validationResult);
673 | this.logger.warn(
674 | `
675 | Warning! The Instagram API has been changed since this version of instamancer was released.
676 | More info: https://scriptsmith.github.io/instamancer/api-change
677 | `,
678 | {validationReporter, post},
679 | );
680 | }
681 | }
682 |
683 | /**
684 | * Stimulate the page until responses gathered
685 | */
686 | protected async getNext() {
687 | await this.progress(Progress.SCRAPING);
688 | while (true) {
689 | // Process results (if any)
690 | await this.processRequests();
691 | await this.processResponses();
692 |
693 | // Finish page promises
694 | if (this.pagePromises.length > 0) {
695 | await this.progress(Progress.BRANCHING);
696 | await Promise.all(this.pagePromises);
697 | this.pagePromises = [];
698 | }
699 |
700 | // Check if finished
701 | if (this.finished) {
702 | break;
703 | }
704 |
705 | // Pause if paused
706 | await this.waitResume();
707 |
708 | // Interact with page to stimulate request
709 | await this.jump();
710 |
711 | // Stop if no data is being gathered
712 | if (this.jumps === this.failedJumps) {
713 | if (this.fullAPI) {
714 | if (!this.responseFromAPI) {
715 | this.finish(FinishedReasons.NO_RESPONSE);
716 | }
717 | } else if (this.index === 0) {
718 | this.finish(FinishedReasons.NO_INCREMENT);
719 |
720 | const pageContent = {content: ""};
721 | try {
722 | pageContent.content = await this.page.content();
723 | } catch (e) {
724 | // No content
725 | }
726 |
727 | this.logger.error(
728 | "Page failed to make requests",
729 | pageContent,
730 | );
731 | break;
732 | }
733 | }
734 |
735 | // Enable grafting if required
736 | if (this.jumps % this.jumpMod === 0) {
737 | await this.initiateGraft();
738 | }
739 |
740 | // Sleep
741 | await this.sleep(this.sleepTime);
742 |
743 | // Hibernate if rate-limited
744 | if (this.hibernate) {
745 | await this.sleep(this.hibernationTime);
746 | this.hibernate = false;
747 | }
748 |
749 | // Break if posts in buffer
750 | await this.postBufferLock.acquireAsync();
751 | const posts = this.postBuffer.length;
752 | this.postBufferLock.release();
753 | if (posts > 0) {
754 | break;
755 | }
756 | }
757 | }
758 |
759 | /**
760 |  * Halt execution, reporting progress once per second
761 |  * @param time Seconds
762 |  */
763 | protected async sleep(time: number) {
764 |     let remaining = time;
765 |     while (remaining > 0) {
766 |         this.sleepRemaining = remaining;
767 |         await this.progress(Progress.SCRAPING);
768 |
769 |         // Wait a full second, or just the fractional remainder at the end
770 |         const delay = remaining < 1 ? remaining * 1000 : 1000;
771 |         await new Promise((resolve) => {
772 |             setTimeout(resolve, delay);
773 |         });
774 |         remaining--;
775 |     }
776 |
777 |     this.sleepRemaining = 0;
778 |     await this.progress(Progress.SCRAPING);
779 | }
775 |
776 | /**
777 |  * Create the browser and page, then visit the url
778 |  *
779 |  * Uses the caller-supplied browser when there is one; otherwise launches a
780 |  * browser, unless the same browser is being reused after the first start.
781 |  *
782 |  * @returns true if the page loaded, false if loading failed and the
783 |  * failure was handled by handleConstructionError
784 |  */
785 | private async constructPage(): Promise<boolean> {
786 |     // Browser args
787 |     const args: string[] = [];
788 |     /* istanbul ignore if */
789 |     if (process.env.NO_SANDBOX) {
790 |         args.push("--no-sandbox");
791 |         args.push("--disable-setuid-sandbox");
792 |     }
793 |     if (this.proxyURL !== undefined) {
794 |         args.push("--proxy-server=" + this.proxyURL);
795 |     }
796 |
797 |     // Browser launch options
798 |     const options: LaunchOptions = {
799 |         args,
800 |         headless: this.headless,
801 |     };
802 |     if (this.executablePath !== undefined) {
803 |         options.executablePath = this.executablePath;
804 |     }
805 |
806 |     // Launch browser
807 |     if (this.browserInstance) {
808 |         await this.progress(Progress.LAUNCHING);
809 |         this.browser = this.browserInstance;
810 |         this.browserDisconnected = !this.browser.isConnected();
811 |         this.browser.on(
812 |             "disconnected",
813 |             () => (this.browserDisconnected = true),
814 |         );
815 |     } else if (!this.sameBrowser || !this.started) {
816 |         await this.progress(Progress.LAUNCHING);
817 |         this.browser = await launch(options);
818 |         this.browserDisconnected = false;
819 |         this.browser.on(
820 |             "disconnected",
821 |             () => (this.browserDisconnected = true),
822 |         );
823 |     }
824 |
825 |     // New page
826 |     this.page = await this.browser.newPage();
827 |     await this.progress(Progress.OPENING);
828 |
829 |     // Attempt to visit URL
830 |     try {
831 |         await this.page.goto(this.url);
832 |
833 |         // Check page loads
834 |         /* istanbul ignore next */
835 |         const pageLoaded = await this.page.evaluate(() => {
836 |             const headings = document.querySelectorAll("h2");
837 |             for (const heading of Array.from(headings)) {
838 |                 if (
839 |                     heading.innerHTML ===
840 |                     "Sorry, this page isn't available."
841 |                 ) {
842 |                     return false;
843 |                 }
844 |             }
845 |             return true;
846 |         });
847 |         if (!pageLoaded) {
848 |             await this.handleConstructionError(
849 |                 "Page loaded with no content",
850 |                 10,
851 |             );
852 |             return false;
853 |         }
854 |
855 |         // Run defaultPageFunctions
856 |         for (const f of this.defaultPageFunctions) {
857 |             await this.page.evaluate(f);
858 |         }
859 |
860 |         // Fix issue with disabled scrolling
861 |         /* istanbul ignore next */
862 |         await this.page.evaluate(() => {
863 |             setInterval(() => {
864 |                 try {
865 |                     document.body.style.overflow = "";
866 |                 } catch (error) {
867 |                     // This callback runs inside the page, where `this.logger`
868 |                     // does not exist; report through the page console instead
869 |                     console.error("Failed to update style", error);
870 |                 }
871 |             }, 10000);
872 |         });
873 |     } catch (e) {
874 |         await this.handleConstructionError(e, 60);
875 |         return false;
876 |     }
877 |     return true;
878 | }
871 |
/**
 * Handle errors that occur during page construction: log the failure,
 * wait out the timeout, then close the page and the browser.
 * @param error Description of the failure, written to the log
 * @param timeout Seconds to sleep before closing
 */
private async handleConstructionError(error: string, timeout: number) {
    // Report the failure, then back off
    const details = {error, url: this.url};
    this.logger.error("Construction error", details);
    await this.progress(Progress.ABORTED);
    await this.sleep(timeout);

    // Tear down this attempt; the page may already have been closed
    const pageStillOpen = !this.page.isClosed();
    if (pageStillOpen) {
        await this.page.close();
    }
    await this.browser.close();
}
887 |
/**
 * Block while the scraper is paused, reporting the PAUSED state and
 * re-checking the flag every 200 milliseconds.
 */
private async waitResume() {
    while (this.paused === true) {
        await this.progress(Progress.PAUSED);

        // Poll interval before looking at the pause flag again
        await new Promise((resolve) => {
            setTimeout(resolve, 200);
        });
    }
}
905 |
/**
 * Remove and return the oldest post in the postBuffer while holding the
 * buffer lock. Returns null when the buffer is empty.
 */
private async postPop() {
    await this.postBufferLock.acquireAsync();
    const post = this.postBuffer.length > 0 ? this.postBuffer.shift() : null;
    this.postBufferLock.release();
    return post;
}
918 |
/**
 * Report the current state: emit a debug log entry and rewrite the status
 * line on stderr. Does nothing when the scraper is silent.
 *
 * Output is serialised through writeLock. The lock is released in a
 * `finally` block so that a failure while logging or writing cannot leave
 * it held and stall every later progress report.
 * @param state State to display
 */
private async progress(state: Progress) {
    // End if silent
    if (this.silent) {
        return;
    }

    // Lock
    await this.writeLock.acquireAsync();
    try {
        // A total of 0 means no limit
        const total = this.total === 0 ? "Unlimited" : this.total;

        // Generate output string
        const idStr = chalk.bgYellow.black(` ${this.id} `);
        const totalStr = chalk.bgBlack(` Total: ${total} `);
        const stateStr = chalk.bgWhite.black(` State: ${state} `);
        const sleepStr = chalk.bgWhite.black(
            ` Sleeping: ${this.sleepRemaining} `,
        );
        const indexStr = chalk.bgWhite.black(` Scraped: ${this.index} `);

        this.logger.debug({
            id: this.id,
            index: this.index,
            sleepRemaining: this.sleepRemaining,
            state,
            total,
        });

        // Print output: "\r" returns to the start of the line and the
        // trailing escape sequence erases whatever is left of the old one
        process.stderr.write(
            `\r${idStr}${totalStr}${stateStr}${sleepStr}${indexStr}\u001B[K`,
        );
    } finally {
        // Release
        this.writeLock.release();
    }
}
959 |
/**
 * Append an intercepted request to the request buffer while holding the
 * request buffer lock.
 */
private async interceptRequest(req: Request) {
    const lock = this.requestBufferLock;

    await lock.acquireAsync();
    this.requestBuffer.push(req);
    await lock.release();
}
968 |
/**
 * Append an intercepted response to the response buffer while holding the
 * response buffer lock.
 */
private async interceptResponse(res: Response) {
    const lock = this.responseBufferLock;

    await lock.acquireAsync();
    this.responseBuffer.push(res);
    await lock.release();
}
977 |
/**
 * Record a failed request in the log and report the ABORTED state.
 */
private async interceptFailure(req: Request) {
    const url = req.url();
    this.logger.info("Failed request", {url});
    await this.progress(Progress.ABORTED);
}
985 |
/**
 * Validate a post and append it to the post buffer.
 *
 * Validation runs while the buffer lock is held. The lock is released in a
 * `finally` block so that a rejected validation cannot leave it held, which
 * would block every later addToPostBuffer and postPop call. When validation
 * rejects, the post is not buffered and the error propagates to the caller.
 * @param post Post to validate and buffer
 */
private async addToPostBuffer(post: PostType) {
    await this.postBufferLock.acquireAsync();
    try {
        await this.validatePost(post);
        this.postBuffer.push(post);
    } finally {
        this.postBufferLock.release();
    }
}
995 |
/**
 * Manipulate the page to stimulate a request: scroll with the keyboard,
 * move the mouse to a random point in the viewport, and count the jump.
 */
private async jump() {
    await this.page.keyboard.press("PageUp");

    // Press "End" once while grafting, otherwise jumpSize times
    const presses = this.graft ? 1 : this.jumpSize;
    let pressed = 0;
    while (pressed < presses) {
        await this.page.keyboard.press("End");
        pressed++;
    }

    // Move mouse randomly
    const width = this.page.viewport().width;
    const height = this.page.viewport().height;
    const x = Math.round(width * Math.random());
    const y = Math.round(height * Math.random());
    await this.page.mouse.move(x, y);

    this.jumps += 1;
}
1016 |
/**
 * Switch the scraper into grafting mode when grafting is enabled: report
 * the GRAFTING state, fire the "grafting" plugin event and set the graft
 * flag. Does nothing when grafting is disabled.
 */
private async initiateGraft() {
    if (this.enableGrafting) {
        await this.progress(Progress.GRAFTING);

        // Fire the plugin event; its result is not awaited
        this.executePlugins("grafting");

        // Enable grafting
        this.graft = true;
    }
}
1033 |
/**
 * Queue a postPage request for each post link already present on the page,
 * stopping once the requested total has been reached.
 */
private async scrapeDefaultPosts() {
    // Collect the shortcode (second-to-last path segment) of every link
    // that points at a post
    /* istanbul ignore next */
    const shortCodes = await this.page.evaluate((url) => {
        const codes = [];
        for (const link of Array.from(document.links)) {
            const segments = link.href.split("/");
            if (link.href.startsWith(url) && segments.length >= 2) {
                codes.push(segments[segments.length - 2]);
            }
        }
        return codes;
    }, this.defaultPostURL);

    // Add postPage promises
    for (const shortCode of shortCodes) {
        // A total of 0 means no limit
        const withinLimit = this.total === 0 || this.index < this.total;
        if (!withinLimit) {
            this.finish(FinishedReasons.TOTAL_REACHED_PAGE);
            break;
        }

        this.index++;
        this.pagePromises.push(
            this.postPage(shortCode, this.postPageRetries),
        );
    }
}
1067 |
/**
 * Register the event handlers of each plugin. For every event name in
 * pluginFunctions, a plugin's "<event>Event" method (when present) is bound
 * to a context holding the plugin and this scraper, then stored.
 * @param plugins Plugins to register; a missing list is ignored
 */
private addPlugins(plugins: IPlugin[]) {
    if (!plugins) {
        return;
    }

    const events = Object.keys(this.pluginFunctions);
    for (const plugin of plugins) {
        for (const event of events) {
            const handler = plugin[event + "Event"];
            if (!handler) {
                continue;
            }

            // The handler sees this context as its `this`
            const context: IPluginContext = {
                plugin,
                state: this,
            };
            this.pluginFunctions[event].push(handler.bind(context));
        }
    }
}
1087 |
/**
 * Run every plugin callback registered for an event.
 *
 * Synchronous events call their callbacks one after another and return
 * nothing. Asynchronous events start all callbacks and return a promise
 * that resolves once every callback has resolved.
 * @param event Name of the plugin event to fire
 * @param args Arguments forwarded to each callback
 */
private executePlugins(event: SyncPluginEventsType, ...args): void;
private executePlugins(
    event: AsyncPluginEventsType,
    ...args
): Promise<unknown>;
private executePlugins(event: PluginEventsType, ...args) {
    // NOTE(review): `in` tests property keys, so this assumes
    // SyncPluginEvents is keyed by event name - confirm it is not an array
    // of names.
    if (event in SyncPluginEvents) {
        // Dispatch on the requested event and forward its arguments, rather
        // than always running the "construction" callbacks with none
        for (const pluginFunction of this.pluginFunctions[event]) {
            // @ts-ignore
            pluginFunction(...args);
        }
        return;
    }

    return Promise.all(
        // @ts-ignore
        this.pluginFunctions[event].map((cb) => cb(...args)),
    );
}
1106 | }
1107 |
/**
 * The states of progress that the API can be in. Used to output status.
 * Each value is the text shown after "State:" in the status line.
 */
enum Progress {
    // Reported by constructPage before a browser is attached or launched
    LAUNCHING = "Launching",
    // Reported by constructPage once a new page exists, before navigation
    OPENING = "Navigating",
    // Reported on every tick of sleep()
    SCRAPING = "Scraping",
    BRANCHING = "Branching",
    // Reported by initiateGraft when grafting is enabled
    GRAFTING = "Grafting",
    CLOSING = "Closing",

    // Reported repeatedly by waitResume while the scraper is paused
    PAUSED = "Paused",
    // Reported by handleConstructionError and interceptFailure
    ABORTED = "Request aborted",
}
1122 |
/**
 * Reasons why the collection finished.
 *
 * The members have no initialisers, so they take the numeric values 0-5 in
 * declaration order; reordering them changes those values.
 */
enum FinishedReasons {
    // forceStop used
    FORCED_STOP,

    // API response doesn't contain next page
    API_FINISHED,

    // Total posts required have been collected from the API
    TOTAL_REACHED_API,

    // Total posts required have been collected from the default posts
    // (passed to finish() by scrapeDefaultPosts)
    TOTAL_REACHED_PAGE,

    // No API response intercepted after interacting with page
    NO_RESPONSE,

    // Index hasn't increased after interacting with page
    NO_INCREMENT,
}
1145 |
--------------------------------------------------------------------------------
/src/api/postIdSet.ts:
--------------------------------------------------------------------------------
/**
 * A set of post ids used to detect duplicates
 */
export class PostIdSet {
    // Ids seen so far. The element type is stated explicitly: a bare `Set`
    // is not a valid type annotation and would leave the ids untyped.
    private ids: Set<string> = new Set<string>();

    /**
     * Add a post id to the set.
     * @param id Post id to record
     * @return true if the id was already in the set, false if not.
     */
    public add(id: string): boolean {
        const contains = this.ids.has(id);
        this.ids.add(id);
        return contains;
    }
}
17 |
--------------------------------------------------------------------------------
/src/api/search.ts:
--------------------------------------------------------------------------------
1 | import * as t from "io-ts";
2 | import {excess} from "io-ts-excess";
3 | import {IPlugin} from "../../plugins";
4 | import {IOptions} from "./api";
5 | import {Instagram} from "./instagram";
6 |
/**
 * io-ts codec for one user entry of a search response: a numeric `position`
 * plus the `user` record, which is wrapped in excess().
 * NOTE(review): excess() comes from io-ts-excess and presumably rejects
 * properties not listed here - confirm against that package.
 */
export const Users = t.type({
    position: t.number,
    user: excess(
        t.type({
            full_name: t.string,
            account_badges: t.array(t.undefined),
            biography_product_mentions: t.array(t.undefined),
            has_anonymous_profile_picture: t.boolean,
            is_private: t.boolean,
            is_verified: t.boolean,
            latest_reel_media: t.number,
            mutual_followers_count: t.number,
            pk: t.string,
            profile_pic_id: t.union([t.string, t.undefined]),
            profile_pic_url: t.string,
            username: t.string,
        }),
    ),
});
26 |
/**
 * io-ts codec for one place entry of a search response: the `place` record
 * (with its nested `location`, both wrapped in excess()) plus a numeric
 * `position`.
 */
export const Places = t.type({
    place: excess(
        t.type({
            // Accepted without validation
            header_media: t.any,
            location: excess(
                t.type({
                    address: t.string,
                    city: t.string,
                    external_source: t.string,
                    facebook_places_id: t.number,
                    // Coordinates may be absent
                    lat: t.union([t.undefined, t.number]),
                    lng: t.union([t.undefined, t.number]),
                    name: t.string,
                    pk: t.string,
                    short_name: t.string,
                }),
            ),
            // Any array is accepted; elements are not validated
            media_bundles: t.UnknownArray,
            slug: t.string,
            subtitle: t.string,
            title: t.string,
        }),
    ),
    position: t.number,
});
52 |
/**
 * io-ts codec for one hashtag entry of a search response: the `hashtag`
 * record (wrapped in excess()) plus a numeric `position`.
 */
export const Hashtags = t.type({
    hashtag: excess(
        t.type({
            id: t.string,
            media_count: t.number,
            name: t.string,
            profile_pic_url: t.string,
            search_result_subtitle: t.string,
            use_default_avatar: t.boolean,
        }),
    ),
    position: t.number,
});
66 |
/**
 * io-ts codec for a whole search response: status fields plus the arrays of
 * hashtag, place and user entries. Passed to the Instagram base class by the
 * Search constructor.
 */
export const SearchResult = t.type({
    clear_client_cache: t.boolean,
    has_more: t.boolean,
    hashtags: t.array(Hashtags),
    places: t.array(Places),
    rank_token: t.string,
    status: t.string,
    users: t.array(Users),
});
76 |
/**
 * Static type of a value described by the SearchResult codec.
 * `t.TypeOf` is generic and needs the codec's type as its argument.
 */
export type TSearchResult = t.TypeOf<typeof SearchResult>;
78 |
/**
 * Options accepted by a search: every IOptions field except the ones that
 * are removed here (total, fullAPI, hibernationTime, sleepTime).
 */
export type ISearchOptions = Omit<
    IOptions,
    "total" | "fullAPI" | "hibernationTime" | "sleepTime"
>;
86 |
/**
 * Search options with an optional list of plugins.
 * NOTE(review): IPlugin may be generic in the post type; if so, its type
 * argument is missing here - confirm against the plugins module.
 */
export interface ISearchOptionsPlugins extends ISearchOptions {
    plugins?: IPlugin[];
}
90 |
/**
 * Performs one search: types the query into the page's search input, waits
 * for the matching request and response, and returns the response payload.
 * NOTE(review): if Instagram is generic in its post type, its type argument
 * is missing from the extends clause - confirm against instagram.ts.
 */
export class Search extends Instagram {
    // URL prefix that matchURL compares request and response URLs against
    public readonly catchURL = "https://www.instagram.com/web/";
    // Last payload handed to processResponseData; returned by get()
    private searchResult: TSearchResult;
    // Text typed into the search input
    private readonly searchQuery: string;
    // Selector of the input element that get() clicks before typing
    private readonly inputElementQuery: string = "input[type='text']";

    /**
     * @param query Text to search for
     * @param options Options forwarded to the Instagram base class
     */
    constructor(query: string, options: ISearchOptions = {}) {
        super(
            "https://instagram.com/explore/tags/instagram",
            "",
            "",
            "",
            options,
            SearchResult,
        );
        this.searchQuery = query;
    }

    /**
     * Run the search and return its result, starting the scraper first if
     * it has not been started and stopping it afterwards.
     */
    public async get() {
        if (!this.started) {
            await this.start();
        }
        try {
            await this.page.waitForSelector(this.inputElementQuery, {
                timeout: 30000,
            });
        } catch {
            // Timeout
            // (deliberately ignored; if the input is still missing the
            // click below is what fails)
        }
        await this.page.click(this.inputElementQuery);

        // NOTE(review): the waits below are registered only after the query
        // has been typed; a request sent before waitForRequest is registered
        // would presumably not be seen by it - confirm this cannot happen.
        await this.page.keyboard.sendCharacter(this.searchQuery);
        await this.page.waitForRequest((req) => this.matchURL(req.url()));
        await this.processRequests();
        await this.page.waitForResponse((res) => this.matchURL(res.url()));
        await this.processResponses();
        await this.stop();
        return this.searchResult;
    }

    /**
     * Whether a URL starts with catchURL.
     */
    public matchURL(url: string) {
        return url.startsWith(this.catchURL);
    }

    /**
     * Pass the response payload to validatePost, then keep it as the
     * search result.
     */
    protected async processResponseData(data: TSearchResult) {
        await this.validatePost(data);
        this.searchResult = data;
    }
}
140 |
--------------------------------------------------------------------------------
/src/api/types.ts:
--------------------------------------------------------------------------------
1 | // tslint:disable: object-literal-sort-keys
2 | import * as t from "io-ts";
3 | import {excess} from "io-ts-excess";
4 |
/**
 * io-ts codec for a location object. `address_json` may be a string,
 * undefined or null.
 */
export const Location = t.type({
    id: t.string,
    has_public_page: t.boolean,
    name: t.string,
    slug: t.string,
    address_json: t.union([t.string, t.undefined, t.null]),
});
12 |
/** io-ts codec for the owner of a PostNode: only the owner's id. */
export const PostNodeOwner = t.type({
    id: t.string,
});
16 |
/** io-ts codec for the owner record attached to a CommentNode. */
export const CommentNodeOwner = t.type({
    id: t.string,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
});
23 |
/**
 * io-ts codec for the owner record of a ShortcodeMedia: the same four
 * fields as CommentNodeOwner plus viewer-relationship and account flags.
 */
export const ShortcodeMediaOwner = t.type({
    id: t.string,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
    blocked_by_viewer: t.boolean,
    followed_by_viewer: t.boolean,
    full_name: t.string,
    has_blocked_viewer: t.boolean,
    is_private: t.boolean,
    is_unpublished: t.boolean,
    requested_by_viewer: t.boolean,
});
37 |
/**
 * io-ts codec for pagination info: whether a next page exists and the end
 * cursor, which may be null.
 */
export const PageInfo = t.type({
    has_next_page: t.boolean,
    end_cursor: t.union([t.string, t.null]),
});
42 |
/** io-ts codec for a numeric height/width pair. */
export const Dimensions = t.type({
    height: t.number,
    width: t.number,
});
47 |
/** io-ts codec for an object holding a single numeric `count`. */
export const Counter = t.type({
    count: t.number,
});
51 |
/**
 * io-ts codec for a gating record: button labels, description, gating type
 * and title. Used for the `gating_info` and `sensitivity_friction_info`
 * fields of other codecs in this file.
 */
export const GatingInfo = t.type({
    buttons: t.array(t.string),
    description: t.string,
    gating_type: t.string,
    title: t.string,
});
58 |
/**
 * io-ts codec for an array of display resources, each a source string with
 * its configured width and height.
 */
export const DisplayResources = t.array(
    t.type({
        src: t.string,
        config_width: t.number,
        config_height: t.number,
    }),
);
66 |
/**
 * io-ts codec for the node of a caption-style edge. Every field may be
 * undefined.
 */
export const EdgeMediaToCaptionNode = t.type({
    text: t.union([t.string, t.undefined]),
    shortcode: t.union([t.string, t.undefined]),
    is_video: t.union([t.boolean, t.undefined]),
    video_url: t.union([t.string, t.undefined]),
    display_resources: t.union([DisplayResources, t.undefined]),
});
74 |
/**
 * io-ts codec for an edge list whose nodes are EdgeMediaToCaptionNode. It is
 * reused in this file for tagged-user, sponsor-user and related-media edges
 * as well as captions.
 */
export const EdgeMediaToCaption = t.type({
    edges: t.array(
        t.type({
            node: EdgeMediaToCaptionNode,
        }),
    ),
});
82 |
/**
 * io-ts codec for a related profile: account fields, a follower count and
 * the profile's timeline media edges.
 */
export const RelatedProfile = t.type({
    id: t.string,
    full_name: t.string,
    is_private: t.boolean,
    is_verified: t.boolean,
    profile_pic_url: t.string,
    username: t.string,
    edge_followed_by: t.type({
        count: t.number,
    }),
    edge_owner_to_timeline_media: t.type({
        count: t.number,
        edges: t.array(
            t.type({
                node: t.type({
                    __typename: t.string,
                    id: t.string,
                    shortcode: t.string,
                    edge_media_preview_like: Counter,
                    edge_media_preview_comment: Counter,
                    thumbnail_src: t.string,
                    owner: t.type({
                        id: t.string,
                        username: t.string,
                    }),
                    gating_info: t.union([GatingInfo, t.null, t.undefined]),
                    is_video: t.boolean,
                    accessibility_caption: t.union([t.string, t.null]),
                }),
            }),
        ),
    }),
});
116 |
/**
 * io-ts codec for the related-profiles edge list. Each edge's node is a
 * RelatedProfile or undefined.
 */
export const EdgeRelatedProfiles = t.type({
    edges: t.array(
        t.type({
            node: t.union([t.undefined, RelatedProfile]),
        }),
    ),
});
124 |
/**
 * io-ts codec for the child media edges of a sidecar post. Not exported;
 * used by PostNode and ShortcodeMedia.
 */
const EdgeSidecarToChildren = t.type({
    edges: t.array(
        t.type({
            node: t.type({
                __typename: t.string,
                id: t.string,
                shortcode: t.union([t.string, t.undefined]),
                dimensions: Dimensions,
                // Only null or undefined is accepted for these two fields
                gating_info: t.union([t.null, t.undefined]),
                fact_check_information: t.union([t.null, t.undefined]),
                media_preview: t.union([t.undefined, t.string, t.null]),
                display_url: t.string,
                display_resources: DisplayResources,
                accessibility_caption: t.union([t.string, t.undefined, t.null]),
                is_video: t.boolean,
                video_url: t.union([t.string, t.undefined]),
                tracking_token: t.string,
                edge_media_to_tagged_user: EdgeMediaToCaption,
            }),
        }),
    ),
});
147 |
/**
 * io-ts codec for the node of a post. Many fields are unions with
 * undefined, so they may be absent. Wrapped in excess() by the Post codec.
 */
export const PostNode = t.type({
    __typename: t.union([t.string, t.undefined]),
    comments_disabled: t.boolean,
    location: t.union([t.null, t.undefined, Location]),
    id: t.string,
    edge_media_to_caption: EdgeMediaToCaption,
    shortcode: t.string,
    edge_media_to_comment: Counter,
    taken_at_timestamp: t.number,
    sensitivity_friction_info: t.union([GatingInfo, t.null, t.undefined]),
    // Only null or undefined is accepted for the next three fields
    media_overlay_info: t.union([t.null, t.undefined]),
    fact_check_information: t.union([t.null, t.undefined]),
    fact_check_overall_rating: t.union([t.undefined, t.null]),
    dimensions: Dimensions,
    display_url: t.string,
    edge_liked_by: t.union([Counter, t.undefined]),
    edge_media_preview_like: Counter,
    owner: PostNodeOwner,
    thumbnail_src: t.string,
    thumbnail_resources: t.union([DisplayResources, t.undefined]),
    is_video: t.boolean,
    accessibility_caption: t.union([t.string, t.undefined, t.null]),
    display_resources: t.union([DisplayResources, t.undefined]),
    should_log_client_event: t.union([t.undefined, t.boolean]),
    tracking_token: t.union([t.undefined, t.string]),
    edge_media_to_tagged_user: t.union([t.undefined, EdgeMediaToCaption]),
    edge_media_to_sponsor_user: t.union([t.undefined, EdgeMediaToCaption]),
    dash_info: t.union([
        t.undefined,
        t.type({
            is_dash_eligible: t.boolean,
            video_dash_manifest: t.null,
            number_of_qualities: t.number,
        }),
    ]),
    video_url: t.union([t.undefined, t.string]),
    video_view_count: t.union([t.undefined, t.number]),
    // Only null or undefined is accepted here
    gating_info: t.union([t.null, t.undefined]),
    media_preview: t.union([t.undefined, t.string, t.null]),
    product_type: t.union([t.undefined, t.string]),
    viewer_has_liked: t.union([t.undefined, t.boolean]),
    viewer_has_saved: t.union([t.boolean, t.undefined]),
    viewer_has_saved_to_collection: t.union([t.boolean, t.undefined]),
    viewer_in_photo_of_you: t.union([t.boolean, t.undefined]),
    viewer_can_reshare: t.union([t.boolean, t.undefined]),
    edge_sidecar_to_children: t.union([EdgeSidecarToChildren, t.undefined]),
});
195 |
/**
 * io-ts codec for a single comment: its text, creation time, owner, viewer
 * flags and like counter.
 */
export const CommentNode = t.type({
    id: t.string,
    text: t.string,
    created_at: t.number,
    did_report_as_spam: t.boolean,
    owner: CommentNodeOwner,
    viewer_has_liked: t.boolean,
    edge_liked_by: Counter,
});
205 |
/**
 * io-ts codec for a counted list of comment edges. ShortcodeMedia also uses
 * it for `edge_media_preview_like`.
 */
export const EdgeMediaPreviewComment = t.type({
    count: t.number,
    edges: t.array(
        t.type({
            node: CommentNode,
        }),
    ),
});
214 |
/** io-ts codec for a list of comment edges without a count. */
export const EdgeMediaHoistedComment = t.type({
    edges: t.array(
        t.type({
            node: CommentNode,
        }),
    ),
});
222 |
/**
 * io-ts codec for a parent comment: every CommentNode field plus the
 * paginated list of its threaded replies. Not exported.
 */
const EdgeMediaToParentCommentNode = t.intersection([
    CommentNode,
    t.type({
        edge_threaded_comments: t.type({
            count: t.number,
            page_info: PageInfo,
            edges: t.array(
                t.type({
                    node: CommentNode,
                }),
            ),
        }),
    }),
]);
237 |
/** io-ts codec for a post: a `node` holding a PostNode wrapped in excess(). */
export const Post = t.type({
    node: excess(PostNode),
});
241 |
/**
 * io-ts codec for the counted, paginated list of parent comments, each with
 * its threaded replies.
 */
export const EdgeMediaToParentComment = t.type({
    count: t.number,
    page_info: PageInfo,
    edges: t.array(
        t.type({
            node: EdgeMediaToParentCommentNode,
        }),
    ),
});
251 |
/**
 * io-ts codec for the `shortcode_media` object of a single post. SinglePost
 * wraps it in excess(); FullApiPost reuses its props with a different
 * `location` codec.
 */
export const ShortcodeMedia = t.type({
    __typename: t.string,
    id: t.string,
    shortcode: t.string,
    edge_media_to_comment: t.union([Counter, t.undefined]),
    thumbnail_src: t.union([t.undefined, t.string]),
    dimensions: Dimensions,
    gating_info: t.union([GatingInfo, t.null, t.undefined]),
    sensitivity_friction_info: t.union([GatingInfo, t.null, t.undefined]),
    // Only null is accepted here
    fact_check_information: t.null,
    fact_check_overall_rating: t.union([t.undefined, t.null]),
    // Only null is accepted here
    media_overlay_info: t.null,
    media_preview: t.union([t.string, t.null]),
    display_url: t.string,
    display_resources: DisplayResources,
    accessibility_caption: t.union([t.string, t.undefined, t.null]),
    is_video: t.boolean,
    should_log_client_event: t.union([t.boolean, t.undefined]),
    tracking_token: t.string,
    edge_media_to_tagged_user: EdgeMediaToCaption,
    edge_media_to_caption: EdgeMediaToCaption,
    caption_is_edited: t.boolean,
    has_ranked_comments: t.boolean,
    has_audio: t.union([t.boolean, t.undefined]),
    edge_media_to_parent_comment: t.union([
        EdgeMediaToParentComment,
        t.undefined,
    ]),
    edge_media_to_hoisted_comment: t.union([
        EdgeMediaHoistedComment,
        t.undefined,
    ]),
    edge_media_preview_comment: t.union([EdgeMediaPreviewComment, t.undefined]),
    edge_related_profiles: EdgeRelatedProfiles,
    comments_disabled: t.boolean,
    commenting_disabled_for_viewer: t.boolean,
    clips_music_attribution_info: t.union([t.null, t.undefined]),
    taken_at_timestamp: t.number,
    // NOTE(review): validated with the comment-edge codec (count plus
    // CommentNode edges) - confirm this matches the like-preview payload
    edge_media_preview_like: EdgeMediaPreviewComment,
    edge_media_to_sponsor_user: EdgeMediaToCaption,
    // A string or null here; FullApiPost replaces this with Location or null
    location: t.union([t.string, t.null]),
    viewer_has_liked: t.boolean,
    viewer_has_saved: t.boolean,
    viewer_has_saved_to_collection: t.boolean,
    viewer_in_photo_of_you: t.boolean,
    viewer_can_reshare: t.boolean,
    owner: ShortcodeMediaOwner,
    is_ad: t.boolean,
    edge_web_media_to_related_media: EdgeMediaToCaption,
    edge_sidecar_to_children: t.union([EdgeSidecarToChildren, t.undefined]),
    dash_info: t.union([
        t.undefined,
        t.type({
            is_dash_eligible: t.boolean,
            video_dash_manifest: t.null,
            number_of_qualities: t.number,
        }),
    ]),
    video_url: t.union([t.undefined, t.string]),
    video_view_count: t.union([t.undefined, t.number]),
    video_play_count: t.union([t.undefined, t.null, t.number]),
    encoding_status: t.union([t.undefined, t.string, t.null]),
    is_published: t.union([t.undefined, t.boolean]),
    product_type: t.union([t.undefined, t.string]),
    title: t.union([t.undefined, t.string, t.null]),
    video_duration: t.union([t.undefined, t.number]),
});
319 |
/**
 * io-ts codec for a single-post payload: `shortcode_media` holding a
 * ShortcodeMedia wrapped in excess().
 */
export const SinglePost = t.type({
    shortcode_media: excess(ShortcodeMedia),
});
323 |
/**
 * io-ts codec for a full-API post payload: the same fields as
 * ShortcodeMedia, except that `location` is a Location object or null
 * instead of a string or null.
 */
export const FullApiPost = t.type({
    shortcode_media: excess(
        t.type({
            ...ShortcodeMedia.props,
            // Listed after the spread so it replaces ShortcodeMedia's codec
            location: t.union([Location, t.null]),
        }),
    ),
});
332 |
333 | // tslint:enable: object-literal-sort-keys
334 |
// `t.TypeOf` is generic: each alias names the codec whose static type it
// extracts.

/** Static type of a value described by the Post codec. */
export type TPost = t.TypeOf<typeof Post>;

/** Static type of a value described by the SinglePost codec. */
export type TSinglePost = t.TypeOf<typeof SinglePost>;

/** Static type of a value described by the FullApiPost codec. */
export type TFullApiPost = t.TypeOf<typeof FullApiPost>;
340 |
--------------------------------------------------------------------------------
/src/cli.ts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | import * as aws from "aws-sdk";
4 | import * as fs from "fs";
5 | import * as readline from "readline";
6 | import * as winston from "winston";
7 |
8 | import * as path from "path";
9 | import {v4 as uuid} from "uuid";
10 | import * as plugins from "../plugins";
11 | import {createApi, IOptions} from "./api/api";
12 | import {TFullApiPost, TPost} from "./api/types";
13 | import {GetPool} from "./getpool/getPool";
14 | import * as depotUpload from "./http/depot";
15 | import {download, toCSV, toJSON} from "./http/download";
16 | import * as s3Upload from "./http/s3";
17 |
/**
 * Build the winston logger for a parsed command line.
 *
 * A file transport writing to args["logfile"] is attached unless
 * args["logging"] is "none", in which case the logger is silent and has no
 * transports.
 */
const getLogger = (args) => {
    const level = args["logging"];
    const silent = level === "none";

    // Only attach the log file when logging is enabled
    const transports = silent
        ? []
        : [
              new winston.transports.File({
                  filename: args["logfile"],
                  level,
                  silent,
              }),
          ];

    return winston.createLogger({level, silent, transports});
};
35 |
/**
 * Translate parsed command line arguments into API options.
 * @param args Parsed arguments
 * @param logger Logger stored in the options
 * @returns Options with one new instance of every plugin named in
 * args["plugin"]
 * @throws Error when a named plugin is not found in the plugins module
 */
function getOptions(args, logger) {
    const options: IOptions = {
        enableGrafting: args["graft"],
        executablePath: args["browser"],
        fullAPI: args["full"],
        // The CLI flag is "visible", the option is its inverse
        headless: !args["visible"],
        logger,
        plugins: [],
        sameBrowser: args["sameBrowser"],
        silent: args["quiet"],
        sleepTime: args["sleep"],
        strict: args["strict"],
        total: args["count"],
    };

    // Instantiate the requested plugins, rejecting unknown names
    for (const pluginName of args["plugin"]) {
        const PluginClass = plugins.plugins[pluginName];
        if (!PluginClass) {
            throw new Error("Couldn't find plugin " + pluginName);
        }
        options.plugins.push(new PluginClass());
    }

    return options;
}
60 |
/**
 * Build the argument parser and run the command found in args.
 *
 * The hashtag, user and post commands are handed to spawn; search runs a
 * query and prints the result as JSON on stdout; batch reads commands from
 * a file and runs them one after another through buildParser, then exits
 * the process.
 *
 * @param args Command line passed straight to yargs (batch mode passes each
 * line of the batch file as a single string)
 * @param callback Invoked with no arguments once a hashtag, user, post or
 * search command has finished. Not invoked by the batch command.
 */
function buildParser(args, callback) {
    /* tslint:disable:no-unused-expression */
    require("yargs")(args)
        .usage("Usage: $0 [options]")
        .command("hashtag [id]", "Scrape a hashtag", {}, async (handleArgs) => {
            await spawn(handleArgs);
            callback();
        })
        .command(
            "user [id]",
            "Scrape a users posts",
            {},
            async (handleArgs) => {
                await spawn(handleArgs);
                callback();
            },
        )
        .command(
            "post [ids]",
            "Scrape a comma-separated list of posts",
            {},
            async (handleArgs) => {
                await spawn(handleArgs);
                callback();
            },
        )
        .command(
            "search [query]",
            "Perform a search of users, tags and places",
            {},
            async (handleArgs) => {
                const logger = getLogger(handleArgs);
                const options = getOptions(handleArgs, logger);
                if (!handleArgs["query"]) {
                    throw new Error("query required");
                }
                const search = createApi(
                    "search",
                    handleArgs["query"],
                    options,
                );
                const result = await search.get();
                process.stdout.write("\n");
                process.stdout.write(JSON.stringify(result, null, 2));
                process.stdout.write("\n");
                callback();
            },
        )
        .command(
            "batch [batchfile]",
            "Read newline-separated arguments from a file",
            {},
            (handleArgs) => {
                // Take the file from the parsed positional rather than from
                // a fixed index into the raw arguments, which is wrong when
                // options precede the command or when args is a string
                const batchfile = handleArgs["batchfile"];
                if (!batchfile) {
                    throw new Error("batchfile required");
                }

                // A list of functions which create new Promises that are
                // resolved by buildParser when the spawn commands are
                // finished
                // See https://stackoverflow.com/a/45951080/7435520
                const functions = [];

                // Read the list of commands from file
                readline
                    .createInterface({
                        crlfDelay: Infinity,
                        input: fs.createReadStream(batchfile),
                    })
                    .on(
                        "line",
                        // For each line, create a new function which
                        // creates a new promise to be resolved by
                        // buildParser. Empty lines and lines starting with
                        // "#" are skipped.
                        (line) => {
                            if (line.length > 0 && line.charAt(0) !== "#") {
                                functions.push(
                                    () =>
                                        new Promise((res) =>
                                            buildParser(line, res),
                                        ),
                                );
                            }
                        },
                    )
                    .on(
                        "close",
                        // When all lines have been read, synchronously
                        // execute the commands by waiting for their
                        // promises to be resolved
                        async () => {
                            for (const f of functions) {
                                await f();
                            }
                            process.exit();
                        },
                    );
            },
        )
        /* tslint:disable:object-literal-sort-keys */
        .options({
            count: {
                alias: "c",
                number: true,
                default: 0,
                describe: "Number of posts to download (0 for all)",
                group: "Configuration",
            },
            full: {
                alias: ["f"],
                boolean: true,
                default: false,
                describe: "Retrieve full post data",
                group: "Configuration",
            },
            sleep: {
                alias: ["s"],
                number: true,
                default: 2,
                describe: "Seconds to sleep between interactions",
                group: "Configuration",
            },
            graft: {
                alias: "g",
                boolean: true,
                default: true,
                describe: "Enable grafting",
                group: "Configuration",
            },
            browser: {
                alias: ["b"],
                string: true,
                default: undefined,
                describe: "Browser path. Defaults to the puppeteer version",
                group: "Configuration",
            },
            sameBrowser: {
                boolean: true,
                default: false,
                describe: "Use a single browser when grafting",
                group: "Configuration",
            },
            download: {
                alias: "d",
                boolean: true,
                default: false,
                describe: "Save images from posts",
                group: "Download",
            },
            downdir: {
                default: "downloads/[endpoint]/[id]",
                describe: "Download path",
                group: "Download",
            },
            video: {
                alias: "v",
                boolean: true,
                default: false,
                describe: "Download videos (requires full)",
                implies: "full",
                group: "Download",
            },
            sync: {
                boolean: true,
                default: false,
                describe: "Force download between requests",
                group: "Download",
            },
            threads: {
                alias: "k",
                number: true,
                default: 4,
                describe: "Parallel download / depot threads",
                group: "Download",
            },
            waitDownload: {
                alias: "w",
                boolean: true,
                default: false,
                describe: "Download media after scraping",
                group: "Download",
            },
            bucket: {
                string: true,
                default: undefined,
                describe: "Upload files to an AWS S3 bucket",
                group: "Upload",
            },
            depot: {
                string: true,
                default: undefined,
                describe: "Upload files to a URL with a PUT request (depot)",
                group: "Upload",
            },
            file: {
                alias: ["o"],
                string: true,
                default: "[id]",
                describe: "Output filename. '-' for stdout",
                group: "Output",
            },
            type: {
                alias: ["t"],
                default: "json",
                describe: "Filetype",
                choices: ["csv", "json", "both"],
                group: "Output",
            },
            mediaPath: {
                alias: ["m"],
                boolean: true,
                default: false,
                describe: "Add filepaths to _mediaPath",
                group: "Output",
            },
            visible: {
                boolean: true,
                default: false,
                describe: "Show browser on the screen",
                group: "Display",
            },
            quiet: {
                alias: ["q"],
                boolean: true,
                default: false,
                describe: "Disable progress output",
                group: "Display",
            },
            logging: {
                alias: ["l"],
                default: "none",
                choices: ["none", "error", "info", "debug"],
                group: "Logging",
            },
            logfile: {
                string: true,
                default: "instamancer.log",
                describe: "Log file name",
                group: "Logging",
            },
            strict: {
                boolean: true,
                default: false,
                describe: "Throw an error on response type mismatch",
                group: "Validation",
            },
            plugin: {
                alias: ["p"],
                array: true,
                default: [],
                describe: "Use a plugin from the plugins directory",
                group: "Plugins",
            },
        })
        .demandCommand()
        .example(
            "$0 hashtag instagood -fvd",
            "Download all the available posts, and their media from #instagood",
        )
        .example(
            "$0 user arianagrande --type=csv --logging=info --visible",
            "Download Ariana Grande's posts to a CSV file with a non-headless browser, and log all events",
        )
        .epilog(
            "Source code available at https://github.com/ScriptSmith/instamancer",
        )
        .strict().argv;
    /* tslint:enable:no-unused-expression */
}
329 |
/**
 * Pick the download URL and file type for one media node.
 *
 * The last `display_resources` entry is read in place instead of being
 * popped, so the post that is later printed or saved keeps every entry.
 * @param node A post's `shortcode_media` or one album child node
 * @param wantVideo Whether video nodes should be fetched as videos
 * @returns The media URL and the file type to store it as
 */
function selectMedia(
    node: {
        is_video: boolean;
        video_url?: string;
        display_resources: Array<{src: string}>;
    },
    wantVideo: boolean,
): [string, FILETYPES] {
    if (node.is_video && wantVideo) {
        return [node.video_url, FILETYPES.VIDEO];
    }
    const resources = node.display_resources;
    return [resources[resources.length - 1].src, FILETYPES.IMAGE];
}

/**
 * Spawn an instance of the API
 *
 * Scrapes the requested endpoint, optionally queues each post's media for
 * download or upload, and finally writes the collected posts as CSV and/or
 * JSON (each post is printed to stdout instead when the file is "-").
 * @param args Parsed command line arguments
 * @throws Error when neither an id nor a list of ids is present
 */
async function spawn(args) {
    // Initiate logger
    const logger = getLogger(args);

    // Check id
    if (!(args["id"] || args["ids"])) {
        throw new Error("Id required");
    }

    // Pick endpoint
    const endpoint = args["_"][0];
    let ids;
    if (endpoint === "post") {
        ids = args["ids"].split(",");
        args["id"] = ids.length === 1 ? ids[0] : "posts";
        args["full"] = true;
    } else {
        ids = args["id"];
    }

    // Define options
    const options: IOptions = getOptions(args, logger);

    // Replace downdir (the endpoint name, not the whole positional array)
    const downdir = args["downdir"]
        .replace("[id]", args["id"])
        .replace("[endpoint]", endpoint);

    // Replace depot url
    let depotUrl = args["depot"];
    if (depotUrl && depotUrl.includes("[uuid]")) {
        depotUrl = depotUrl.replace("[uuid]", uuid());
        if (!args["quiet"]) {
            process.stdout.write(depotUrl + "\n");
        }
    }

    // Get s3 bucket
    const s3Bucket = args["bucket"];

    // Check if outputting to stdout
    const printOutput = args["file"] === "-";

    // Connect to object storage
    let downloadUpload;
    let toCSVFunc = toCSV;
    let toJSONFunc = toJSON;
    if (depotUrl) {
        // Depot
        const depotConfig = {
            directory: downdir,
            url: depotUrl,
            logger,
        };

        downloadUpload = depotUpload.depot.bind(depotConfig);
        toCSVFunc = depotUpload.toCSV.bind(depotConfig);
        toJSONFunc = depotUpload.toJSON.bind(depotConfig);
    } else if (s3Bucket) {
        // s3
        const s3Config = {
            bucket: s3Bucket,
            directory: downdir,
            s3: new aws.S3(),
            logger,
        };

        downloadUpload = s3Upload.s3.bind(s3Config);
        toCSVFunc = s3Upload.toCSV.bind(s3Config);
        toJSONFunc = s3Upload.toJSON.bind(s3Config);
    } else {
        // Download
        downloadUpload = download.bind({
            directory: downdir,
            logger,
        });
    }

    // Start API
    logger.info("Starting API at " + Date.now());
    const obj = createApi(endpoint, ids, options);
    await obj.start();

    // Start download pool
    const getPool = new GetPool(args["threads"], downloadUpload);

    // Pick between synchronous and parallel downloads
    const downloadFunction = args["sync"]
        ? downloadUpload
        : getPool.add.bind(getPool);

    // Add pause callback
    function handleKeypress(str, key) {
        if (key.name === "space") {
            obj.pause();
        } else if (key.name === "c" && key.ctrl) {
            process.stdout.write("\n");
            process.kill(process.pid, "SIGINT");
        }
    }

    process.stdin.on("keypress", handleKeypress);

    // Array of urls and filenames
    let downloadMedia: [string, string, FILETYPES][] = [];

    // Download posts
    const posts = [];
    for await (const post of obj.generator()) {
        // Add _mediaPath key
        if (args["mediaPath"]) {
            post["_mediaPath"] = [];
        }

        // Identify download urls
        if (args["download"] && ("node" in post || "shortcode_media" in post)) {
            // Check the scraping level
            if (args["full"]) {
                const postObject = post as TFullApiPost;
                const children =
                    postObject.shortcode_media.edge_sidecar_to_children;
                if (children !== undefined) {
                    // Album: queue every child
                    for (const child of children.edges) {
                        const [mediaUrl, mediaType] = selectMedia(
                            child.node,
                            args["video"],
                        );
                        saveMediaMetadata(
                            post,
                            args,
                            downloadMedia,
                            downdir,
                            mediaUrl,
                            child.node.shortcode,
                            mediaType,
                        );
                    }
                } else {
                    // Single image or video
                    const [mediaUrl, mediaType] = selectMedia(
                        postObject.shortcode_media,
                        args["video"],
                    );
                    saveMediaMetadata(
                        post,
                        args,
                        downloadMedia,
                        downdir,
                        mediaUrl,
                        postObject.shortcode_media.shortcode,
                        mediaType,
                    );
                }
            } else {
                const postObject = post as TPost;
                saveMediaMetadata(
                    post,
                    args,
                    downloadMedia,
                    downdir,
                    postObject.node.thumbnail_src,
                    postObject.node.shortcode,
                    FILETYPES.IMAGE,
                );
            }
        }

        // Output if required
        if (printOutput) {
            process.stdout.write(JSON.stringify(post, null, 2) + "\n");
        } else {
            posts.push(post);
        }

        // Download the identified media
        if (!args["waitDownload"]) {
            for (const asset of downloadMedia) {
                await downloadFunction(...asset);
            }
            downloadMedia = [];
        }
    }

    // Download remaining media
    for (const asset of downloadMedia) {
        await downloadFunction(...asset);
    }

    // Close download pool
    await new Promise((resolve) => {
        getPool.close(resolve);
    });
    await Promise.all(getPool.promises);

    // Replace filename
    const filename = args["file"]
        .replace("[id]", args["id"])
        .replace("[endpoint]", endpoint);

    // Save file
    if (!printOutput) {
        // An extension is appended when both formats are written or when the
        // default file name template is in use
        const addExtension =
            args["type"] === "both" || args["file"] === "[id]";
        if (args["type"] !== "json") {
            await toCSVFunc(posts, addExtension ? filename + ".csv" : filename);
        }
        if (args["type"] !== "csv") {
            await toJSONFunc(
                posts,
                addExtension ? filename + ".json" : filename,
            );
        }
    }

    // Remove only the pause callback registered above
    process.stdin.removeListener("keypress", handleKeypress);

    // Close logger
    logger.close();
}
573 |
/**
 * Queue one media file for download and, when the `mediaPath` argument is
 * set, record where the file will be stored on the post itself.
 * @param post Post whose `_mediaPath` list receives the file location
 * @param args Parsed command line arguments (`mediaPath` and `swift` are read)
 * @param downloadMedia Queue that receives a `[url, shortcode, fileType]` entry
 * @param downDir Directory the media is stored in
 * @param url Remote URL of the media
 * @param shortcode Shortcode used as the file name
 * @param fileType File extension of the media
 */
function saveMediaMetadata(
    post: object,
    args: object,
    downloadMedia: [string, string, FILETYPES][],
    downDir: string,
    url: string,
    shortcode: string,
    fileType: FILETYPES,
) {
    const wantsPaths = args["mediaPath"];
    if (wantsPaths) {
        const localPath = path.join(downDir, `${shortcode}.${fileType}`);
        const recorded = args["swift"] ? "swift://" + localPath : localPath;
        post["_mediaPath"].push(recorded);
    }

    const queued: [string, string, FILETYPES] = [url, shortcode, fileType];
    downloadMedia.push(queued);
}
590 |
// Make stdin emit keypress events; raw mode is enabled when stdin offers it
readline.emitKeypressEvents(process.stdin);
if ("setRawMode" in process.stdin) {
    process.stdin.setRawMode(true);
}

// Parse the command line, handing the parser a callback that exits cleanly
const exitCleanly = () => {
    process.exit(0);
};
buildParser(process.argv.slice(2), exitCleanly);
601 |
/**
 * File extensions used for downloaded media. The value is appended to the
 * post shortcode to build the stored file name.
 */
enum FILETYPES {
    VIDEO = "mp4",
    IMAGE = "jpg",
}
606 |
--------------------------------------------------------------------------------
/src/getpool/getPool.ts:
--------------------------------------------------------------------------------
1 | import * as winston from "winston";
2 |
/**
 * A single download / upload job handled by the pool.
 */
class GetJob {
    // Set once the job has settled, whether it succeeded or failed
    public finished: boolean = false;
    private readonly url: string;
    private readonly name: string;
    private readonly extension: string;
    private readonly downloadUpload: (
        url: string,
        name: string,
        extension: string,
    ) => Promise<void>;

    /**
     * @param url The URL of the file
     * @param name The name used to identify the file
     * @param extension The file extension
     * @param downloadUpload Function that performs the transfer
     */
    constructor(url: string, name: string, extension: string, downloadUpload) {
        this.url = url;
        this.name = name;
        this.extension = extension;
        this.downloadUpload = downloadUpload;
    }

    /**
     * Run the transfer. A rejection still propagates to the caller, but the
     * job is marked finished either way so it never blocks a pool slot.
     */
    public async start() {
        try {
            await this.downloadUpload(this.url, this.name, this.extension);
        } finally {
            this.finished = true;
        }
    }
}
26 |
/**
 * A pool of jobs that only executes k jobs 'simultaneously'
 *
 * Jobs are queued with add() and started by a loop that runs every 100ms.
 * After close() is called, the loop stops and invokes the supplied callback
 * once no job is queued or running.
 */
export class GetPool {
    // Promises of every job that has been started
    public promises: Array<Promise<void>> = [];

    // Jobs that are currently being executed
    private runningJobs: GetJob[] = [];

    // Jobs that are yet to be executed
    private queuedJobs: GetJob[] = [];

    // Maximum number of jobs to be executed simultaneously
    private readonly maxConnections: number;

    // Looping interval executing promises
    private readonly loop: ReturnType<typeof setInterval>;

    // End-of-input signal triggered externally by close()
    private finished: boolean = false;

    // End-of-input resolve function
    private resolve: () => void;

    // Download / Upload function
    private readonly downloadUpload: (
        url: string,
        name: string,
        extension: string,
    ) => Promise<void>;

    /**
     * @param connections Maximum number of jobs running at the same time
     * @param downloadUpload Function that transfers a single file
     */
    constructor(
        connections: number = 1,
        downloadUpload: (
            url: string,
            name: string,
            extension: string,
        ) => Promise<void>,
    ) {
        this.maxConnections = connections;
        this.loop = setInterval(() => this.poolLoop(), 100);
        this.downloadUpload = downloadUpload;
    }

    /**
     * Queue a file; it is started by the pool loop when a slot is free.
     */
    public add(url: string, name: string, extension: string) {
        this.queuedJobs.push(
            new GetJob(url, name, extension, this.downloadUpload),
        );
    }

    /**
     * Signal that no more jobs will be added.
     * @param resolve Called once the queue is empty and nothing is running
     */
    public close(resolve) {
        this.finished = true;
        this.resolve = resolve;
    }

    private poolLoop() {
        // Drop only the jobs that have finished. splice(i) without a delete
        // count would also evict every unfinished job after index i.
        this.runningJobs = this.runningJobs.filter((job) => !job.finished);

        // Add new jobs to empty running slots
        while (
            this.queuedJobs.length > 0 &&
            this.runningJobs.length < this.maxConnections
        ) {
            const job = this.queuedJobs.shift();
            this.promises.push(job.start());
            this.runningJobs.push(job);
        }

        // End the interval when end-of-input signal given
        if (
            this.finished &&
            this.queuedJobs.length === 0 &&
            this.runningJobs.length === 0
        ) {
            clearInterval(this.loop);
            this.resolve();
        }
    }
}
130 |
--------------------------------------------------------------------------------
/src/http/depot.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import * as fs from "fs";
3 | import * as path from "path";
4 | import * as tmp from "tmp";
5 | import {resolve, URL} from "url";
6 | import * as winston from "winston";
7 | import * as download from "./download";
8 |
/**
 * Context bound as `this` for the depot upload functions in this file.
 */
interface IUpload {
    // Base URL that upload paths are resolved against
    url: string;
    // Directory joined with each media file name to form its upload path
    directory: string;
    // Logger that receives upload failures
    logger: winston.Logger;
}
14 |
/**
 * Stream a remote file into the depot with a PUT request.
 * Failures are logged through the bound logger and are not rethrown.
 * @param url The URL of the file
 * @param name The name used to identify the file
 * @param extension The file extension, without a leading dot
 */
export async function depot(
    this: IUpload,
    url: string,
    name: string,
    extension: string,
) {
    try {
        // Fetch the source as a stream
        const source = await axios({
            method: "GET",
            responseType: "stream",
            url,
        });

        // Headers forwarded to the depot
        const contentType = source.headers["content-type"];
        const contentLength = source.headers["content-length"];

        // Where the file ends up
        const target = resolve(
            this.url,
            path.join(this.directory, name + "." + extension),
        );

        // Push the stream to the depot
        try {
            await axios({
                data: source.data,
                headers: {
                    "Content-Length": contentLength,
                    "Content-Type": contentType,
                },
                maxContentLength: Infinity,
                method: "PUT",
                ...authURL(target),
            });
        } catch (error) {
            this.logger.error(`Uploading ${url} failed`, error);
        }
    } catch (e) {
        this.logger.error(`Uploading ${url} failed`, e);
    }
}
54 |
/**
 * Split the credentials out of a URL.
 *
 * `URL.username` and `URL.password` are percent-encoded, so they are decoded
 * before being returned as basic-auth credentials. A value that is not valid
 * percent-encoding is returned unchanged.
 * @param url URL that may embed `user:password@`
 * @returns The URL without credentials, and the credentials themselves
 */
function authURL(
    url: string,
): {url: string; auth: {username: string; password: string}} {
    const decode = (value: string): string => {
        try {
            return decodeURIComponent(value);
        } catch (e) {
            return value;
        }
    };

    const components = new URL(url);
    const auth = {
        password: decode(components.password),
        username: decode(components.username),
    };
    components.username = "";
    components.password = "";

    return {
        auth,
        url: components.toString(),
    };
}
71 |
/**
 * Dump posts to a temporary file and PUT that file to the depot.
 *
 * The temporary file is removed and the read stream closed even when dumping
 * or uploading fails; the failure itself still propagates to the caller.
 * @param posts Posts to serialise
 * @param filePath Upload path, resolved against the bound depot URL
 * @param fileFunc Writes the posts to a local file path
 * @param contentType Content-Type header sent with the upload
 */
async function uploadFile(
    this: IUpload,
    posts: object[],
    filePath: string,
    fileFunc: (posts: object[], filePath: string) => Promise<void>,
    contentType: string,
) {
    // Create tmp file
    const tmpFile = tmp.fileSync({keep: true});

    try {
        // Dump posts to file
        await fileFunc(posts, tmpFile.name);

        // Read file to a stream
        const fileStream = fs.createReadStream(tmpFile.name);
        const contentLength = fs.statSync(tmpFile.name).size;

        // Upload file
        const uploadUrl = resolve(this.url, filePath);
        try {
            await axios({
                data: fileStream,
                headers: {
                    "Content-Length": contentLength,
                    "Content-Type": contentType,
                },
                maxContentLength: Infinity,
                method: "PUT",
                url: uploadUrl,
            });
        } finally {
            // Release the file descriptor if the upload stopped early
            fileStream.destroy();
        }
    } finally {
        // Delete file
        fs.unlinkSync(tmpFile.name);
    }
}
105 |
/**
 * Serialise the posts as CSV and upload the result to the depot
 * @param posts Posts to serialise
 * @param filePath Upload path, resolved against the bound depot URL
 */
export async function toCSV(this: IUpload, posts: object[], filePath: string) {
    await uploadFile.call(this, posts, filePath, download.toCSV, "text/csv");
}
113 |
/**
 * Serialise the posts as JSON and upload the result to the depot
 * @param posts Posts to serialise
 * @param filePath Upload path, resolved against the bound depot URL
 */
export async function toJSON(this: IUpload, posts: object[], filePath: string) {
    await uploadFile.call(this, posts, filePath, download.toJSON, "text/json");
}
121 |
--------------------------------------------------------------------------------
/src/http/download.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import * as fs from "fs";
3 | import {Parser, transforms} from "json2csv";
4 | import * as winston from "winston";
5 |
/**
 * Context bound as `this` for download().
 */
interface IDownload {
    // Directory the downloaded files are written to (created when missing)
    directory: string;
    // Logger that receives download failures
    logger: winston.Logger;
}
10 |
/**
 * Download file
 *
 * The bound directory is created when missing. Request and stream failures
 * are logged and never thrown.
 * @param url The URL of the file
 * @param name The name used to identify the file
 * @param extension The file extension without a leading dot (eg. "jpg" or "mp4")
 */
export async function download(
    this: IDownload,
    url: string,
    name: string,
    extension: string,
) {
    // Best effort: an mkdir error is passed to resolve and ignored here; a
    // missing directory surfaces as a write error below
    await new Promise((resolve) => {
        fs.mkdir(this.directory, {recursive: true}, resolve);
    });
    try {
        // Get data
        const response = await axios({
            method: "get",
            responseType: "stream",
            url,
        });

        // Write to file
        await new Promise<void>((resolve, reject) => {
            const stream = fs.createWriteStream(
                this.directory + "/" + name + "." + extension,
            );
            // Without these handlers a failing stream would leave the
            // promise pending forever
            stream.on("error", reject);
            response.data.on("error", (err) => {
                stream.destroy();
                reject(err);
            });
            stream.on("finish", () => resolve());
            // noinspection TypeScriptValidateJSTypes
            response.data.pipe(stream);
        });
    } catch (e) {
        this.logger.info(`Downloading ${url} failed`);
        this.logger.debug(e);
    }
}
48 |
/**
 * Write the posts to a CSV file, flattening nested objects into columns
 * @param posts Posts to serialise
 * @param filePath Destination file
 */
export async function toCSV(posts: object[], filePath: string) {
    const flattenNested = transforms.flatten();
    const csvText = new Parser({transforms: [flattenNested]}).parse(posts);
    fs.writeFileSync(filePath, csvText);
}
57 |
/**
 * Write the posts to a file as a JSON array, one post at a time, with the
 * elements separated by ", "
 * @param posts Posts to serialise
 * @param filePath Destination file
 */
export async function toJSON(posts: object[], filePath: string) {
    fs.writeFileSync(filePath, "[");
    posts.forEach((post, index) => {
        const separator = index === 0 ? "" : ", ";
        fs.appendFileSync(filePath, separator + JSON.stringify(post));
    });
    fs.appendFileSync(filePath, "]");
}
74 |
--------------------------------------------------------------------------------
/src/http/s3.ts:
--------------------------------------------------------------------------------
1 | import * as aws from "aws-sdk";
2 | import axios from "axios";
3 | import * as fs from "fs";
4 | import * as tmp from "tmp";
5 | import * as winston from "winston";
6 | import * as download from "./download";
7 |
/**
 * Context bound as `this` for the S3 upload functions in this file.
 */
interface IUpload {
    // Name of the bucket objects are uploaded to
    bucket: string;
    // Key prefix placed before each media file name
    directory: string;
    // S3 client used for the uploads
    s3: aws.S3;
    // Logger that receives upload failures
    logger: winston.Logger;
}
14 |
/**
 * Stream a remote file into the bound S3 bucket.
 * Failures are logged through the bound logger and are not rethrown.
 * @param url The URL of the file
 * @param name The name used to identify the file
 * @param extension The file extension, without a leading dot
 */
export async function s3(
    this: IUpload,
    url: string,
    name: string,
    extension: string,
) {
    try {
        // Fetch the source as a stream
        const source = await axios({
            method: "GET",
            responseType: "stream",
            url,
        });

        // Headers forwarded to S3
        const contentType = source.headers["content-type"];
        const contentLength = source.headers["content-length"];
        const key = this.directory + "/" + name + "." + extension;

        // The promise resolves whether or not the upload succeeded
        await new Promise((resolve) => {
            const onUploaded = (err) => {
                if (err !== null) {
                    this.logger.error(`Uploading ${url} failed`, err);
                }
                resolve();
            };
            this.s3.upload(
                {
                    Body: source.data,
                    Bucket: this.bucket,
                    ContentLength: contentLength,
                    ContentType: contentType,
                    Key: key,
                },
                onUploaded,
            );
        });
    } catch (e) {
        this.logger.error(`Uploading ${url} failed`, e);
    }
}
55 |
/**
 * Dump posts to a temporary file and upload that file to the bound bucket.
 *
 * An upload error is logged rather than thrown. The temporary file is removed
 * even when dumping the posts fails.
 * @param posts Posts to serialise
 * @param filePath Object key to upload to
 * @param fileFunc Writes the posts to a local file path
 * @param contentType Content type stored with the object
 */
async function uploadFile(
    this: IUpload,
    posts: object[],
    filePath: string,
    fileFunc: (posts: object[], filePath: string) => Promise<void>,
    contentType: string,
) {
    // Create tmp file
    const tmpFile = tmp.fileSync({keep: true});

    try {
        // Dump posts to file
        await fileFunc(posts, tmpFile.name);

        // Read file to a stream
        const fileStream = fs.createReadStream(tmpFile.name);
        const contentLength = fs.statSync(tmpFile.name).size;

        // s3 upload
        await new Promise<void>((resolve) => {
            this.s3.upload(
                {
                    Body: fileStream,
                    Bucket: this.bucket,
                    ContentLength: contentLength,
                    ContentType: contentType,
                    Key: filePath,
                },
                (err) => {
                    if (err !== null) {
                        this.logger.error(`Uploading ${filePath} failed`, err);
                    }
                    resolve();
                },
            );
        });
    } finally {
        // Delete file
        fs.unlinkSync(tmpFile.name);
    }
}
95 |
/**
 * Serialise the posts as CSV and upload the result to the bound bucket
 * @param posts Posts to serialise
 * @param filePath Object key to upload to
 */
export async function toCSV(this: IUpload, posts: object[], filePath: string) {
    await uploadFile.call(this, posts, filePath, download.toCSV, "text/csv");
}
103 |
/**
 * Serialise the posts as JSON and upload the result to the bound bucket
 * @param posts Posts to serialise
 * @param filePath Object key to upload to
 */
export async function toJSON(this: IUpload, posts: object[], filePath: string) {
    await uploadFile.call(this, posts, filePath, download.toJSON, "text/json");
}
111 |
--------------------------------------------------------------------------------
/tests/__fixtures__/FakePage.ts:
--------------------------------------------------------------------------------
1 | import * as t from "io-ts";
2 | import {IOptions} from "../../src/api/api";
3 | import {Instagram} from "../../src/api/instagram";
4 |
/**
 * Options for pointing a FakePage at the local test server.
 */
export interface IFakePageOptions {
    // The path on the server, appended to the base URL when set
    path?: string;

    // The port the server is hosted on
    port?: number;

    // The query to get API pages
    pageQuery?: string;

    // The query to get posts
    edgeQuery?: string;

    // The page to catch api requests on (appended to the base URL)
    catchPage?: string;

    // The page to visit posts (appended to the base URL)
    postPage?: string;

    // Regular API options
    options?: IOptions;
}
27 |
// io-ts codec for the fake posts: an object whose `node` has a string `id`
const FakeValidator = t.type({
    node: t.type({
        id: t.string,
    }),
});
33 |
/**
 * Instagram API pointed at the local test server instead of the real site.
 *
 * NOTE(review): the type argument was lost in this listing; it is
 * reconstructed here from the validator passed to super() — confirm against
 * the repository.
 */
export class FakePage extends Instagram<t.TypeOf<typeof FakeValidator>> {
    /**
     * @param options Server location, queries and pages to use; the regular
     * API options are always combined with `silent: true`
     */
    constructor(options: IFakePageOptions = {path: "", port: 0}) {
        let baseURL = "http://127.0.0.1:" + options.port;
        if (options.path) {
            baseURL += options.path;
        }

        const silentOptions: IOptions = {silent: true};
        super(
            baseURL,
            "",
            options.pageQuery,
            options.edgeQuery,
            {
                ...options.options,
                ...silentOptions,
            },
            FakeValidator,
        );

        this.catchURL = baseURL + "/" + options.catchPage;
        this.postURL = baseURL + "/" + options.postPage;

        // Force the scraper to stop after 30 seconds
        setTimeout(async () => {
            await this.forceStop();
        }, 30000);
    }
}
62 |
--------------------------------------------------------------------------------
/tests/__fixtures__/QuickGraft.ts:
--------------------------------------------------------------------------------
1 | import {Hashtag, IOptions} from "../../src/api/api";
2 |
/**
 * Hashtag API whose `jumpMod` is set to 2 right after construction.
 * NOTE(review): presumably this makes grafting trigger sooner during tests —
 * confirm against the meaning of `jumpMod` in the Instagram class.
 */
export class QuickGraft extends Hashtag<{}> {
    constructor(id: string, options: IOptions = {}) {
        super(id, options);
        this.jumpMod = 2;
    }
}
9 |
--------------------------------------------------------------------------------
/tests/server.ts:
--------------------------------------------------------------------------------
1 | import express from "express";
2 | import {AddressInfo} from "net";
3 |
4 | const app = express();
5 |
6 | app.get("/", (req, res) => {
7 | res.send(`
8 |
9 |
27 | `);
28 | });
29 |
// Responds with a JSON body whose status is "fail"
app.get("/rate_limit", (req, res) => {
    res.send(
        JSON.stringify({
            status: "fail",
        }),
    );
});

// Responds with a body that is not valid JSON
app.get("/invalid_json", (req, res) => {
    res.send("invalid");
});

// Responds with valid JSON that is not an object
app.get("/non_object", (req, res) => {
    res.send("1");
});

// Responds with a page that has no edges and no next page
app.get("/no_next_page", (req, res) => {
    res.send(
        JSON.stringify({
            data: {
                end_cursor: "cursor",
                has_next_page: false,
            },
        }),
    );
});

// Responds with two edges sharing the same id, and always reports a next page
app.get("/duplicate_ids", (req, res) => {
    res.send(
        JSON.stringify({
            data: {
                edges: [
                    {
                        node: {
                            id: "1",
                        },
                    },
                    {
                        node: {
                            id: "1",
                        },
                    },
                ],
                end_cursor: "cursor",
                has_next_page: true,
            },
        }),
    );
});

// Responds with a single edge whose id is "badid"
app.get("/invalid_id", (req, res) => {
    res.send(
        JSON.stringify({
            data: {
                edges: [
                    {
                        node: {
                            id: "badid",
                        },
                    },
                ],
                end_cursor: "cursor",
                has_next_page: false,
            },
        }),
    );
});

// Responds with the text of a "page isn't available" error page
app.get("/invalid_page", (req, res) => {
    res.send("Sorry, this page isn't available. ");
});
101 |
// Server handle shared by startServer() and stopServer()
let listener;

/**
 * Start the test server on a port chosen by the operating system.
 * @returns The port the server is listening on
 * @throws When the server emits an error before it starts listening
 */
export async function startServer(): Promise<number> {
    await new Promise<void>((resolve, reject) => {
        // Port 0 lets the operating system choose a free port
        listener = app.listen(0, () => resolve());
        // Without this a failed listen would leave the promise pending
        listener.once("error", reject);
    });

    return (listener.address() as AddressInfo).port;
}
111 |
/**
 * Stop the test server, waiting for its close callback to run.
 */
export async function stopServer() {
    await new Promise((done) => listener.close(done));
}
117 |
--------------------------------------------------------------------------------
/tests/test.spec.ts:
--------------------------------------------------------------------------------
1 | import * as t from "io-ts";
2 | import {launch, Overrides, Request} from "puppeteer";
3 | import * as winston from "winston";
4 | import {createApi, IPlugin} from "..";
5 | import {plugins} from "..";
6 | import {IPluginContext} from "../plugins";
7 | import {IOptions, IOptionsFullApi} from "../src/api/api";
8 | import {FakePage, IFakePageOptions} from "./__fixtures__/FakePage";
9 | import {QuickGraft} from "./__fixtures__/QuickGraft";
10 | import {startServer, stopServer} from "./server";
11 |
// Allow every test up to eight minutes
jest.setTimeout(8 * 60 * 1000);
/* tslint:disable:no-console */

// Ids scraped by the tests
const hashtags = ["beach", "gym", "puppies", "party", "throwback"];
const users = ["snoopdogg", "arianagrande", "bbc", "whitehouse", "australia"];
const posts = [
    "By54GDoHGzK",
    "Be3rTNplCHf",
    "BlBvw2_jBKp",
    "Bzi33wDnxOz",
    "BfzEfy-lK1N",
    "Bneu_dCHVdn",
    "Brx-adXA9C1",
    "Bz5flRagYQt",
    "BmRZH7NFwi6",
    "BpiIJCUnYwy",
];

// Number of posts requested by the size-based tests
let smallSize = 10;
let mediumSize = 100;
let largeSize = 1000;

// Run faster unless executing in CI
if (!process.env.CI) {
    smallSize /= 10;
    mediumSize /= 10;
    largeSize /= 10;
}

// Browser executable: the CHROME environment variable when set, otherwise
// the default path below
const browserPath = process.env.CHROME
    ? process.env.CHROME
    : "/usr/bin/google-chrome";

// Name of an account with 0 posts to test graceful exit
const emptyAccountName = "emptyaccount";

// Logger writing everything to instamancer_tests.log and errors to the console
const createLogger = () =>
    winston.createLogger({
        format: winston.format.json(),
        level: "debug",
        silent: false,
        transports: [
            new winston.transports.File({
                filename: "instamancer_tests.log",
                level: "debug",
            }),
            new winston.transports.Console({
                level: "error",
            }),
        ],
    });
const testWrapperLogger = createLogger();

// Options shared by the library tests: strict validation, 10 posts
const libraryTestOptions: IOptions = {
    logger: createLogger(),
    silent: true,
    strict: true,
    total: 10,
};
71 |
/**
 * Used to debug stalled builds in travis
 *
 * Registers a jest test that, when the CI environment variable is set, prints
 * and logs the test name before running the callback.
 * @param name Test name
 * @param callback Test function
 */
function testWrapper(name: string, callback: () => Promise<void>) {
    test(name, async () => {
        const logSignPost = `JEST: Testing ${name}`;
        if (process.env.CI) {
            console.log(logSignPost);
            testWrapperLogger.info(logSignPost);
        }

        await callback();
    });
}
88 |
// Each endpoint's API object must yield exactly the requested number of posts
describe("Library Classes", () => {
    const expectedCount = 10;
    const apis = {
        hashtag: createApi("hashtag", hashtags[0], libraryTestOptions),
        post: createApi("post", posts, libraryTestOptions),
        user: createApi("user", users[0], libraryTestOptions),
    };

    Object.entries(apis).forEach(([endpoint, api]) => {
        testWrapper(endpoint, async () => {
            let seen = 0;
            for await (const post of api.generator()) {
                expect(post).toBeDefined();
                seen += 1;
            }
            expect(seen).toBe(expectedCount);
        });
    });
});
108 |
// Generators created up front must yield exactly the requested number of posts
describe("Library Functions", () => {
    const expectedCount = 10;
    const hashtagPosts = createApi(
        "hashtag",
        hashtags[0],
        libraryTestOptions,
    ).generator();
    const postPosts = createApi("post", posts, libraryTestOptions).generator();
    const userPosts = createApi(
        "user",
        users[0],
        libraryTestOptions,
    ).generator();
    const generators = {
        hashtag: hashtagPosts,
        post: postPosts,
        user: userPosts,
    };

    Object.entries(generators).forEach(([endpoint, generator]) => {
        testWrapper(endpoint, async () => {
            let seen = 0;
            for await (const post of generator) {
                expect(post).toBeDefined();
                seen += 1;
            }
            expect(seen).toBe(expectedCount);
        });
    });
});
132 |
// Same checks as the library tests, with the fullAPI option switched on
describe("Full API", () => {
    const expectedCount = 10;
    const fullApiOption: IOptionsFullApi = {
        ...libraryTestOptions,
        fullAPI: true,
    };
    const hashtagPosts = createApi(
        "hashtag",
        hashtags[0],
        fullApiOption,
    ).generator();
    const postPosts = createApi("post", posts, fullApiOption).generator();
    const userPosts = createApi("user", users[0], fullApiOption).generator();
    const generators = {
        hashtag: hashtagPosts,
        post: postPosts,
        user: userPosts,
    };

    Object.entries(generators).forEach(([endpoint, generator]) => {
        testWrapper(endpoint, async () => {
            let seen = 0;
            for await (const post of generator) {
                expect(post).toBeDefined();
                seen += 1;
            }
            expect(seen).toBe(expectedCount);
        });
    });
});
156 |
testWrapper("Account with < 10 photos", async () => {
    // This account is not well known and may be deleted at any moment.
    // If this test starts to fail, pick another user with fewer than 10 photos.
    const id = "zhiznizmelochei";
    const fullApiOption: IOptionsFullApi = {
        ...libraryTestOptions,
        fullAPI: true,
    };
    const api = createApi("user", id, fullApiOption);

    let seen = 0;
    for await (const post of api.generator()) {
        expect(post).toBeDefined();
        seen += 1;
    }

    expect(seen).toBeGreaterThan(0);
    // If this user starts posting again, a new account is needed
    expect(seen).toBeLessThan(10);
});
177 |
// Scrape larger totals and check that the exact number of unique posts is
// returned for each endpoint
describe("API limits", () => {
    // An endpoint together with the ids and totals to test it with
    class ApiTestConditions {
        public api: "hashtag" | "user";
        public ids: string[];
        public sizes: number[];

        constructor(api: "hashtag" | "user", ids: string[], sizes: number[]) {
            this.api = api;
            this.ids = ids;
            this.sizes = sizes;
        }
    }

    const endpoints: ApiTestConditions[] = [
        new ApiTestConditions("hashtag", hashtags, [largeSize]),
        new ApiTestConditions("user", users, [mediumSize]),
    ];

    for (const endpoint of endpoints) {
        // Get params
        const sourceApi = endpoint.api;
        const ids = endpoint.ids;
        const sizes = endpoint.sizes;

        for (const size of sizes) {
            // Decide how many ids to test based on size
            // (5 by default, 3 for the medium size, 1 for the large size)
            let sizeIds;
            let splitLen = 5;
            if (size === mediumSize) {
                splitLen = 3;
            } else if (size === largeSize) {
                splitLen = 1;
            }
            sizeIds = ids.slice(0, splitLen);

            for (const id of sizeIds) {
                testWrapper(`${endpoint.api} ${id} ${size}`, async () => {
                    // Specify API options
                    const options: IOptions = {
                        enableGrafting: true,
                        fullAPI: false,
                        headless: true,
                        logger: createLogger(),
                        silent: false,
                        sleepTime: 2,
                        strict: true,
                        total: size,
                    };

                    // Create API
                    const api = createApi(sourceApi, id, options);

                    // Get posts
                    const scraped = [];
                    const postIds = new Set();
                    for await (const post of api.generator()) {
                        postIds.add(post.node.id);
                        scraped.push(post);
                    }

                    // Assert sizes
                    expect(scraped.length).toBe(size);

                    // Check duplicates
                    expect(scraped.length).toBe(postIds.size);
                });
            }
        }
    }
});
248 |
// Scrape the same hashtag once per option set and check the number of posts
describe("API options", () => {
    const hashtagId = "vetinari";
    const total = 50;
    // The expectations below depend on the position of the first and last
    // entries, so keep those two in place when editing this list
    const optionsCollection: [string, IOptions][] = [
        ["No options", {}],
        ["Silence", {silent: true, total}],
        ["Sleep", {sleepTime: 5, total}],
        ["Headless", {headless: false, total}],
        ["Grafting", {enableGrafting: false, total}],
        ["Executable path", {executablePath: browserPath, total}],
        ["Full api", {fullAPI: true, total}],
        ["Limited full api", {fullAPI: true, total: 5}],
    ];

    for (const [index, [name, options]] of optionsCollection.entries()) {
        testWrapper(name, async () => {
            // @ts-ignore
            const tag = createApi("hashtag", hashtagId, options);
            const scraped = [];

            for await (const post of tag.generator()) {
                expect(post).toBeDefined();
                scraped.push(post);
            }

            if (index === 0) {
                // No total given: more than `total` posts are expected
                expect(scraped.length).toBeGreaterThan(total);
            } else if (index === optionsCollection.length - 1) {
                // The last entry limits the total to 5
                expect(scraped.length).toBe(5);
            } else {
                expect(scraped.length).toBe(total);
            }
        });
    }
});
284 |
// Edge cases: empty accounts, disabled grafting, pausing, hibernation and an
// unreachable proxy
describe("Unusual behavior", () => {
    // An account without posts must yield nothing
    testWrapper("Empty page", async () => {
        const user = createApi("user", emptyAccountName, {}).generator();
        const userPosts = [];
        for await (const post of user) {
            userPosts.push(post);
        }
        expect(userPosts.length).toBe(0);
    });

    // The requested total must still be reached with grafting disabled
    testWrapper("No grafting", async () => {
        const total = 100;
        const hashtag = hashtags[0];
        const api = new QuickGraft(hashtag, {total, enableGrafting: false});
        const scraped = [];

        for await (const post of api.generator()) {
            scraped.push(post);
        }

        expect(scraped.length).toBe(total);
    });

    // pause() is called once immediately and again after 20 seconds
    testWrapper("Pausing", async () => {
        const api = createApi("hashtag", hashtags[0], {total: 100});
        const iterator = api.generator();

        api.pause();
        setTimeout(() => {
            api.pause();
        }, 20000);

        for await (const post of iterator) {
            expect(post).toBeDefined();
        }
    });

    // Hibernation is toggled after the first post has been received
    testWrapper("Hibernation", async () => {
        const options: IOptions = {
            hibernationTime: 10,
            total: smallSize,
        };

        const api = createApi("hashtag", hashtags[0], options);
        const iterator = api.generator();

        await iterator.next();
        api.toggleHibernation();

        for await (const post of iterator) {
            expect(post).toBeDefined();
        }
    });

    // Nothing may be scraped through the proxy address below, whether or not
    // the generator throws
    testWrapper("Failed Page visit", async () => {
        const options: IOptions = {
            proxyURL: "127.0.0.1:9999",
        };
        const api = createApi("hashtag", hashtags[0], options);
        const scraped = [];

        try {
            for await (const post of api.generator()) {
                scraped.push(post);
            }
        } catch (e) {
            expect(e).toBeDefined();
        }

        expect(scraped.length).toBe(0);
    });
});
357 |
358 | describe("Network and API issues", () => {
359 | async function testOptions(options: IFakePageOptions) {
360 | options.port = await startServer();
361 | const api = new FakePage(options);
362 | const mock = jest.fn();
363 |
364 | try {
365 | for await (const post of api.generator()) {
366 | mock(post);
367 | }
368 | } catch (e) {
369 | expect(e).toBeDefined();
370 | }
371 | await api.forceStop();
372 |
373 | await stopServer();
374 | }
375 |
376 | testWrapper("Rate limit", async () => {
377 | await testOptions({
378 | catchPage: "rate_limit",
379 | options: {hibernationTime: 10},
380 | });
381 | });
382 |
383 | testWrapper("Invalid JSON", async () => {
384 | await testOptions({catchPage: "invalid_json"});
385 | });
386 |
387 | testWrapper("Non object", async () => {
388 | await testOptions({catchPage: "non_object"});
389 | });
390 |
391 | testWrapper("No next page", async () => {
392 | await testOptions({catchPage: "no_next_page", pageQuery: "data"});
393 | });
394 |
395 | testWrapper("Duplicate post ids", async () => {
396 | await testOptions({
397 | catchPage: "duplicate_ids",
398 | edgeQuery: "data.edges",
399 | pageQuery: "data",
400 | });
401 | });
402 |
403 | testWrapper("Invalid post id", async () => {
404 | await testOptions({
405 | catchPage: "invalid_id",
406 | edgeQuery: "data.edges",
407 | options: {fullAPI: true, total: 1},
408 | pageQuery: "data",
409 | });
410 | });
411 |
412 | testWrapper("Invalid page", async () => {
413 | await testOptions({
414 | path: "/invalid_page",
415 | });
416 | });
417 | });
418 |
419 | describe("Strict mode", () => {
420 | const failingValidator = t.type({
421 | foo: t.string,
422 | });
423 |
424 | testWrapper(
425 | "Should fire warning if strict is false and validations are different",
426 | async () => {
427 | const logger = createLogger();
428 | logger.warn = jest.fn();
429 | const iterator = createApi("hashtag", hashtags[0], {
430 | logger,
431 | strict: false,
432 | total: 1,
433 | validator: failingValidator,
434 | }).generator();
435 |
436 | let i = 0;
437 | for await (const post of iterator) {
438 | i++;
439 | expect(logger.warn).toBeCalledTimes(i);
440 | }
441 | },
442 | );
443 |
444 | testWrapper(
445 | "Should not fire warning if strict is false and validations are ok",
446 | async () => {
447 | const logger = createLogger();
448 | logger.warn = jest.fn();
449 | const iterator = createApi("hashtag", hashtags[0], {
450 | logger,
451 | strict: false,
452 | total: 1,
453 | }).generator();
454 |
455 | for await (const post of iterator) {
456 | expect(logger.warn).toBeCalledTimes(0);
457 | }
458 | },
459 | );
460 |
461 | testWrapper(
462 | "Should throw validation error if strict is true and types are incorrect",
463 | async () => {
464 | expect.hasAssertions();
465 | const iterator = createApi("hashtag", hashtags[0], {
466 | strict: true,
467 | total: 1,
468 | validator: failingValidator,
469 | }).generator();
470 |
471 | try {
472 | await iterator.next();
473 | } catch (e) {
474 | expect(e).toBeInstanceOf(Error);
475 | expect(e.message).toMatch(/^Invalid value/);
476 | }
477 | },
478 | );
479 |
480 | testWrapper(
481 | "Should throw validation error if strict is true and types are incorrect (Post)",
482 | async () => {
483 | expect.hasAssertions();
484 | const iterator = createApi("post", posts, {
485 | strict: true,
486 | total: 1,
487 | validator: failingValidator,
488 | }).generator();
489 |
490 | try {
491 | await iterator.next();
492 | } catch (e) {
493 | expect(e).toBeInstanceOf(Error);
494 | expect(e.message).toMatch(/^Invalid value/);
495 | }
496 | },
497 | );
498 |
499 | testWrapper(
500 | "Should throw validation error if strict is true and types are incorrect (Full Mode)",
501 | async () => {
502 | expect.hasAssertions();
503 | const iterator = createApi("hashtag", hashtags[0], {
504 | fullAPI: true,
505 | strict: true,
506 | total: 1,
507 | validator: failingValidator,
508 | }).generator();
509 |
510 | try {
511 | await iterator.next();
512 | } catch (e) {
513 | expect(e).toBeInstanceOf(Error);
514 | expect(e.message).toMatch(/^Invalid value/);
515 | }
516 | },
517 | );
518 | });
519 |
520 | describe("Search", () => {
521 | testWrapper("Search Result Users", async () => {
522 | const result = await createApi(
523 | "search",
524 | "therock",
525 | libraryTestOptions,
526 | ).get();
527 | expect(result.users.length).toBeGreaterThan(0);
528 | const user = result.users[0].user;
529 | expect(user.username).toBe("therock");
530 | expect(user.full_name).toBeTruthy();
531 | expect(user.profile_pic_url).toBeTruthy();
532 | });
533 |
534 | testWrapper("Search Result Hashtags", async () => {
535 | const result = await createApi(
536 | "search",
537 | "nofilter",
538 | libraryTestOptions,
539 | ).get();
540 | expect(result.hashtags.length).toBeGreaterThan(0);
541 | const hashtag = result.hashtags[0].hashtag;
542 | expect(hashtag.media_count).not.toBeUndefined();
543 | expect(hashtag.name).toBe("nofilter");
544 | });
545 |
546 | testWrapper("Search Result Places", async () => {
547 | const result = await createApi(
548 | "search",
549 | "New york",
550 | libraryTestOptions,
551 | ).get();
552 | expect(result.places.length).toBeGreaterThan(0);
553 | const place = result.places[0].place;
554 | expect(place.title).toMatch(/New York/);
555 | });
556 |
557 | testWrapper("Incorrect validation", async () => {
558 | const failingValidator = t.type({
559 | foo: t.string,
560 | });
561 |
562 | expect.hasAssertions();
563 | const search = createApi("search", "Doesn't matter", {
564 | strict: true,
565 | validator: failingValidator,
566 | });
567 |
568 | try {
569 | await search.get();
570 | } catch (e) {
571 | expect(e).toBeInstanceOf(Error);
572 | expect(e.message).toMatch(/^Invalid value/);
573 | }
574 | await search.forceStop();
575 | });
576 |
577 | testWrapper("Search should fire only one network request", async () => {
578 | const searchRequestsSpy = jest.fn();
579 |
580 | class RequestCounter<PostType> implements IPlugin<PostType> {
581 | public async requestEvent(
582 | this: IPluginContext<IPlugin<PostType>, PostType>,
583 | req: Request,
584 | overrides: Overrides,
585 | ) {
586 | if (this.state.matchURL(req.url())) {
587 | searchRequestsSpy();
588 | }
589 | }
590 | }
591 |
592 | const search = createApi(
593 | "search",
594 | "A really long long long string to find something in Instagram",
595 | {
596 | plugins: [new RequestCounter()],
597 | },
598 | );
599 |
600 | await search.get();
601 | expect(searchRequestsSpy).toBeCalledTimes(1);
602 | });
603 | });
604 |
605 | describe("Plugins", () => {
606 | testWrapper("Internal plugins", async () => {
607 | for (const plugin in plugins) {
608 | if (!plugins.hasOwnProperty(plugin)) {
609 | continue;
610 | }
611 |
612 | const options: IOptions = {
613 | plugins: [new plugins[plugin]()],
614 | silent: true,
615 | total: 100,
616 | };
617 | const hashtag = createApi("hashtag", hashtags[0], options);
618 |
619 | const mock = jest.fn();
620 | for await (const post of hashtag.generator()) {
621 | mock(post);
622 | }
623 | expect(mock).toBeCalledTimes(100);
624 | }
625 | });
626 | });
627 |
628 | describe("Browser instance passed from outside", () => {
629 | const browserOptions = {
630 | headless: true,
631 | args: ["--no-sandbox", "--disable-setuid-sandbox"],
632 | };
633 | testWrapper("Should re-use this browser instance", async () => {
634 | const browser = await launch(browserOptions);
635 |
636 | const hashtagGenerator = createApi("hashtag", hashtags[0], {
637 | browserInstance: browser,
638 | }).generator();
639 | await hashtagGenerator.next();
640 |
641 | const pages = await browser.pages();
642 |
643 | expect(pages.length).toBe(2);
644 |
645 | await browser.close();
646 | });
647 |
648 | testWrapper("Should not close browser instance", async () => {
649 | const browser = await launch(browserOptions);
650 |
651 | const searchGenerator = createApi("search", "therock", {
652 | browserInstance: browser,
653 | }).generator();
654 | await searchGenerator.next();
655 |
656 | expect(browser.isConnected()).toBe(true);
657 |
658 | await browser.close();
659 | });
660 | });
661 |
--------------------------------------------------------------------------------
/tests/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "module": "commonjs",
4 | "target": "es2018",
5 | "noImplicitAny": false,
6 | "inlineSourceMap": true,
7 | "lib": ["dom", "es2018", "esnext.asynciterable"],
8 | "esModuleInterop": true,
9 | "resolveJsonModule": true
10 | },
11 | "compileOnSave": false
12 | }
13 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "module": "commonjs",
4 | "target": "es2018",
5 | "noImplicitAny": false,
6 | "inlineSourceMap": true,
7 | "lib": ["dom", "es2018", "esnext.asynciterable"],
8 | "resolveJsonModule": true,
9 | "esModuleInterop": true
10 | },
11 | "include": ["*.ts", "!*.d.ts", "src/cli.ts"],
12 | "exclude": ["node_modules", "tests/*", "examples/*"]
13 | }
14 |
--------------------------------------------------------------------------------
/tslint.json:
--------------------------------------------------------------------------------
1 | {
2 | "defaultSeverity": "error",
3 | "extends": [
4 | "tslint:recommended"
5 | ],
6 | "jsRules": {},
7 | "rules": {
8 | // Could eventually be re-enabled by removing Hashtag Location User classes
9 | "max-classes-per-file": [false],
10 |
11 | // This needs to be robust enough to support API changes without refactoring
12 | "no-string-literal": false
13 | },
14 | "rulesDirectory": []
15 | }
--------------------------------------------------------------------------------
/utils/validation-generator/.gitignore:
--------------------------------------------------------------------------------
1 | /input.json
2 | /output.ts
--------------------------------------------------------------------------------
/utils/validation-generator/README.md:
--------------------------------------------------------------------------------
1 | # API validation generator
2 |
3 | > Warning! The output which we get from `transform-json-types` library is not perfect. `output.ts` needs to be checked after the automatic transformation.
4 |
5 | This util is used to automatically generate [io-ts](https://github.com/gcanti/io-ts) runtime and type validations for an actual Instagram API.
6 |
7 | To generate these validations two steps are required:
8 |
9 | * Get an actual Instagram API response and save it as JSON
10 | * Get `io-ts` typings from it
11 |
12 | ## Actual API response
13 |
14 | `ts-node utils/validation-generator/get-input.ts`
15 |
16 | The script saves an actual API response for several endpoints to the `input.json` file (which is gitignored).
17 |
18 | ## Generate typings
19 |
20 | > Warning! For some reason the generated typings are slightly broken. You need to replace `Node3` with `Node` inside the `Post` type to fix them.
21 |
22 | 1. `ts-node utils/validation-generator/generate.ts` (The script will save the typings to the `output.ts` file.)
23 | 2. Move all primitive types (those which do not use other types, like `ThumbnailResources`, `Owner` and others) to the top of the file and final types (like `Post`) to the bottom of the file, then fix all the block-scoped variable order errors manually.
24 | 3. Write the typing for `FullApiPost` (generally it is a `SinglePost`, but with `location` as an object).
25 | 4. It is better to make the main type excessive by using [io-ts-excess](https://github.com/goooseman/io-ts-excess). Here's an example:
26 | ```typescript
27 | export const SinglePost = t.type({
28 | shortcode_media: excess(ShortcodeMedia),
29 | });
30 | ```
31 | By making this type excessive, you will get a validation error if new properties appear in the API.
32 | 5. Move `SearchResult`, `User`, `Places`, `Hashtags` types to `src/api/search.ts`
33 | 6. Fix the rest of the typings
34 |
35 | ## Fix typings
36 |
37 | To quickly find all the typing errors in the project, you can run `npm test -- -t "Strict mode"` and `npm test -- -t "Full API"`.
38 |
39 | You can get a lot of really verbose errors, like:
40 |
41 | ``` typescript
42 | Invalid value
43 | {"id":"219469050","has_public_page":true,"name":"Costa Nova, Aveiro, Portugal","slug":"costa-nova-aveiro-portugal","address_json":"{\"street_address\": \"\", \"zip_code\": \"\", \"city_name\": \"Costa Nova, Aveiro, Portugal\", \"region_name\": \"\", \"country_code\": \"PT\", \"exact_city_match\": true, \"exact_region_match\": false, \"exact_country_match\": false}"}
44 | supplied to :
45 | { shortcode_media: { __typename: string, id: string, shortcode: string, dimensions: { height: number, width: number }, gating_info: (string | null), media_preview: (string | null), display_url: string, display_resources: Array<{ src: string, config_width: number, config_height: number }>, accessibility_caption: (string | undefined), is_video: boolean, should_log_client_event: boolean, tracking_token: string, edge_media_to_tagged_user: { edges: Array<{ node: { text: (string | undefined) } }> }, edge_media_to_caption: { edges: Array<{ node: { text: (string | undefined) } }> }, caption_is_edited: boolean, has_ranked_comments: boolean, edge_media_to_parent_comment: ({ count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: ({ id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } & { edge_threaded_comments: { count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } }) }> } | undefined), edge_media_preview_comment: ({ count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } | undefined), comments_disabled: boolean, taken_at_timestamp: number, edge_media_preview_like: { count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, 
viewer_has_liked: boolean, edge_liked_by: { count: number } } }> }, edge_media_to_sponsor_user: { edges: Array<{ node: { text: (string | undefined) } }> }, location: (string | null), viewer_has_liked: boolean, viewer_has_saved: boolean, viewer_has_saved_to_collection: boolean, viewer_in_photo_of_you: boolean, viewer_can_reshare: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string, blocked_by_viewer: boolean, followed_by_viewer: boolean, full_name: string, has_blocked_viewer: boolean, is_private: boolean, is_unpublished: boolean, requested_by_viewer: boolean }, is_ad: boolean, edge_web_media_to_related_media: { edges: Array<{ node: { text: (string | undefined) } }> } } }
46 | /shortcode_media: { __typename: string, id: string, shortcode: string, dimensions: { height: number, width: number }, gating_info: (string | null), media_preview: (string | null), display_url: string, display_resources: Array<{ src: string, config_width: number, config_height: number }>, accessibility_caption: (string | undefined), is_video: boolean, should_log_client_event: boolean, tracking_token: string, edge_media_to_tagged_user: { edges: Array<{ node: { text: (string | undefined) } }> }, edge_media_to_caption: { edges: Array<{ node: { text: (string | undefined) } }> }, caption_is_edited: boolean, has_ranked_comments: boolean, edge_media_to_parent_comment: ({ count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: ({ id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } & { edge_threaded_comments: { count: number, page_info: { has_next_page: boolean, end_cursor: (string | null) }, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } }) }> } | undefined), edge_media_preview_comment: ({ count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, viewer_has_liked: boolean, edge_liked_by: { count: number } } }> } | undefined), comments_disabled: boolean, taken_at_timestamp: number, edge_media_preview_like: { count: number, edges: Array<{ node: { id: string, text: string, created_at: number, did_report_as_spam: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string }, 
viewer_has_liked: boolean, edge_liked_by: { count: number } } }> }, edge_media_to_sponsor_user: { edges: Array<{ node: { text: (string | undefined) } }> }, location: (string | null), viewer_has_liked: boolean, viewer_has_saved: boolean, viewer_has_saved_to_collection: boolean, viewer_in_photo_of_you: boolean, viewer_can_reshare: boolean, owner: { id: string, is_verified: boolean, profile_pic_url: string, username: string, blocked_by_viewer: boolean, followed_by_viewer: boolean, full_name: string, has_blocked_viewer: boolean, is_private: boolean, is_unpublished: boolean, requested_by_viewer: boolean }, is_ad: boolean, edge_web_media_to_related_media: { edges: Array<{ node: { text: (string | undefined) } }> } }
47 | /location: (string | null)
48 | /1: null
49 | ```
50 |
51 | This looks scary, but let's make it simple. We need just two parts from the output.
52 |
53 | The first one is the text representation of the value which the validator could not validate. It is located between the `Invalid value` and `supplied to` strings.
54 | The second one is the type the validator expected, which can be found after the last (or second-to-last) `/` sign.
55 |
56 | In our case the validator expected `string` or `null`, but an object was received.
57 |
58 | So we can fix the typing in the following way:
59 |
60 | ``` typescript
61 | export const Location = t.type({
62 | id: t.string,
63 | has_public_page: t.boolean,
64 | name: t.string,
65 | slug: t.string,
66 | address_json: t.string,
67 | });
68 | ...
69 | location: t.union([t.string, t.null, Location])
70 |
71 | ```
72 |
--------------------------------------------------------------------------------
/utils/validation-generator/generate.ts:
--------------------------------------------------------------------------------
1 | import {writeFileSync} from "fs";
2 | import {dirname, join} from "path";
3 | import transform from "transform-json-types";
4 | // @ts-ignore
5 | import * as json from "./input.json";
6 |
7 | const getPath = () => join(dirname(__filename), "./output.ts");
8 |
9 | const removeVarFromCode = (code: string, varName: string): string => {
10 | const regexp = new RegExp(`\nconst ${varName} =[^;]+;\n`, "gm");
11 | return code.replace(regexp, "");
12 | };
13 |
14 | const addTypeToCode = (code: string, typeName: string): string => {
15 | return `${code}\nexport type T${typeName} = t.TypeOf<typeof ${typeName}>;\n`;
16 | };
17 |
18 | const singularizeVarNameInCode = (
19 | code: string,
20 | varNameSingle: string,
21 | ): string => {
22 | const regexp = new RegExp(`${varNameSingle}s`, "gm");
23 | return code.replace(regexp, varNameSingle);
24 | };
25 |
26 | let output = transform(json, {
27 | lang: "io-ts",
28 | });
29 |
30 | output = `import * as t from "io-ts";\n\n${output}`;
31 | output = `// tslint:disable: object-literal-sort-keys\n${output}`;
32 | output = `${output}// tslint:enable: object-literal-sort-keys\n`;
33 | output = removeVarFromCode(output, "RootInterface");
34 | output = removeVarFromCode(output, "Default");
35 | output = output.replace(/^const/gm, "export const");
36 | output = output.replace(/t\.Array/gm, "t.UnknownArray");
37 | output = output.replace(/\ string/gm, " t.string"); // Really weird
38 | output = output.replace(/t\.Integer/gm, "t.number"); // Integer does not have ts type
39 | output = singularizeVarNameInCode(output, "Post");
40 | output = singularizeVarNameInCode(output, "SearchResult");
41 | output = addTypeToCode(output, "Post");
42 | output = addTypeToCode(output, "SinglePost");
43 | output = addTypeToCode(output, "SearchResult");
44 |
45 | writeFileSync(getPath(), output, {
46 | encoding: "utf-8",
47 | });
48 |
--------------------------------------------------------------------------------
/utils/validation-generator/get-input.ts:
--------------------------------------------------------------------------------
1 | import {writeFileSync} from "fs";
2 | import {dirname, join} from "path";
3 | import {createApi} from "../../";
4 |
5 | const getPath = () => join(dirname(__filename), "./input.json");
6 |
7 | const getResult = async () => {
8 | const posts = await getPosts({
9 | hashtagId: "beach",
10 | userId: "snoopdogg",
11 | });
12 |
13 | const singlePosts = await getSinglePosts({
14 | postsIds: [
15 | "BsOGulcndj-",
16 | "Be3rTNplCHf",
17 | "BlBvw2_jBKp",
18 | "Bi-hISIghYe",
19 | "BfzEfy-lK1N",
20 | "Bneu_dCHVdn",
21 | "Brx-adXA9C1",
22 | "BlTYHvXFrvm",
23 | "BmRZH7NFwi6",
24 | "BpiIJCUnYwy",
25 | ],
26 | });
27 |
28 | const searchResults = await getSearch({
29 | queries: ["beach", "nofilter", "donald"],
30 | });
31 |
32 | return {
33 | posts,
34 | searchResults,
35 | singlePosts,
36 | };
37 | };
38 |
39 | const getSearch = async ({queries}: {queries: string[]}) => {
40 | const result = [];
41 | const objects = queries.map((q) => createApi("search", q, {}));
42 | for (const object of objects) {
43 | result.push(await object.get());
44 | }
45 | return result;
46 | };
47 |
48 | const getPosts = async ({
49 | hashtagId,
50 | userId,
51 | }: {
52 | hashtagId: string;
53 | userId: string;
54 | }) => {
55 | const result = [];
56 |
57 | const options = {
58 | total: 10,
59 | };
60 | const objects = [
61 | createApi("hashtag", hashtagId, options),
62 | createApi("user", userId, options),
63 | ];
64 |
65 | for (const object of objects) {
66 | for await (const post of object.generator()) {
67 | result.push(post);
68 | }
69 | }
70 | return result;
71 | };
72 |
73 | const getSinglePosts = async ({postsIds}: {postsIds: string[]}) => {
74 | const result = [];
75 | const post = createApi("post", postsIds, {});
76 | for await (const singlePost of post.generator()) {
77 | result.push(singlePost);
78 | }
79 | return result;
80 | };
81 |
82 | const run = async () => {
83 | const result = await getResult();
84 | const json = JSON.stringify(result, null, 2);
85 | writeFileSync(getPath(), json, {
86 | encoding: "utf-8",
87 | });
88 | };
89 |
90 | // tslint:disable-next-line: no-console
91 | run().catch(console.error);
92 |
--------------------------------------------------------------------------------