├── .eslintrc.json
├── .editorconfig
├── example.js
├── .gitignore
├── package.json
├── test.js
├── README.md
└── index.js


/.eslintrc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": ["plugin:prettier/recommended"],
 3 |   "plugins": ["prettier"],
 4 |   "env": {
 5 |     "node": true
 6 |   },
 7 |   "rules": {
 8 |     "prettier/prettier": "error"
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # editorconfig.org
 2 | root = true
 3 | 
 4 | [*]
 5 | charset = utf-8
 6 | end_of_line = lf
 7 | insert_final_newline = true
 8 | trim_trailing_whitespace = true
 9 | 
10 | [*.{js}]
11 | indent_size = 4
12 | indent_style = space
13 | 


--------------------------------------------------------------------------------
/example.js:
--------------------------------------------------------------------------------
 1 | let ig = require("./index");
 2 | 
 3 | ig.scrapeTag('veranda').then(result => {
 4 |     console.dir(result);
 5 | });
 6 | ig.scrapeComment('CPHnIGbBh1k').then(result => {
 7 |     console.dir(result);
 8 | });
 9 | ig.scrapeUserPage('jcvrnd19').then(result => {
10 |     console.dir(result);
11 | });


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | node_modules
 3 | /dist
 4 | package-lock.json
 5 | 
 6 | # local env files
 7 | .env.local
 8 | .env.*.local
 9 | 
10 | # Log files
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 | 
15 | # Editor directories and files
16 | .idea
17 | .vscode
18 | *.suo
19 | *.ntvs*
20 | *.njsproj
21 | *.sln
22 | *.sw?
23 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "instagram-scraping",
 3 |   "version": "1.0.17",
 4 |   "description": "NPM module for loading media by hashtag without instagram API",
 5 |   "main": "index.js",
 6 |   "repository": {
 7 |     "type": "git",
 8 |     "url": "git+https://github.com/rzlyp/instagram-scraping.git"
 9 |   },
10 |   "scripts": {
11 |     "test": "mocha -R spec"
12 |   },
13 |   "keywords": [
14 |     "instagram",
15 |     "hashtag",
16 |     "scraping",
17 |     "scrape"
18 |   ],
19 |   "author": "Rizal Yogi Pratama <@rzlyp>",
20 |   "license": "MIT",
21 |   "bugs": {
22 |     "url": "https://github.com/rzlyp/instagram-scraping/issues"
23 |   },
24 |   "homepage": "https://github.com/rzlyp/instagram-scraping#readme",
25 |   "devDependencies": {
26 |     "chai": "3.5.0",
27 |     "eslint": "^6.4.0",
28 |     "eslint-config-prettier": "^6.3.0",
29 |     "eslint-plugin-prettier": "^3.1.1",
30 |     "mocha": "3.1.2",
31 |     "nock": "9.0.2",
32 |     "prettier": "1.18.2"
33 |   },
34 |   "dependencies": {
35 |     "async": "^2.6.0",
36 |     "axios": "^0.21.1",
37 |     "bluebird": "^3.5.1"
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
 1 | var mocha = require("mocha"),
 2 |   assert = require("chai").assert,
 3 |   ig = require("./index");
 4 | 
 5 | var nock = require("nock");
 6 | var api = nock("https://www.instagram.com")
 7 |   .persist()
 8 |   .get("/explore/tags/nrkvalg")
 9 |   .replyWithFile(200, __dirname + "/fixtures/tagPage.html")
10 |   .get(/\/p\/\w+/)
11 |   .replyWithFile(200, __dirname + "/fixtures/postPage.html")
12 |   .get(/\/explore\/locations\/\d+/)
13 |   .replyWithFile(200, __dirname + "/fixtures/locationPage.html");
14 | 
describe("instagram-scraper", function() {
  // The rejection tests use the two-argument form of then(): a failure raised
  // in the fulfilment handler is not routed into the rejection handler, so an
  // unexpected success actually fails the test. The promise is returned to
  // mocha, which fails the test on rejection instead of waiting for a timeout.
  it("should reject when called with missing tag argument", function() {
    return ig.scrapeTag().then(
      function() {
        assert.fail("Promise should be rejected");
      },
      function(err) {
        assert.typeOf(err, "error");
      }
    );
  });

  // Uses the tag that the nock scope actually mocks ("nrkvalg") and checks the
  // fields scrapeTag resolves with: `total` and `medias`.
  it("should return object containing total and medias", function() {
    return ig.scrapeTag("nrkvalg").then(function(result) {
      assert.isAtLeast(result.total, 1);
      assert.equal(result.medias.length, result.total);
    });
  });

  it("should reject when called with missing code argument", function() {
    return ig.scrapePostCode().then(
      function() {
        assert.fail("Promise should be rejected");
      },
      function(err) {
        assert.typeOf(err, "error");
      }
    );
  });
});
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # instagram-scraping
 2 | 
 3 | [![npm downloads](https://img.shields.io/npm/dt/instagram-scraping.svg)](https://npm.im/instagram-scraping)
 4 | 
 5 | > NodeJS module for loading posts from Instagram by hashtag without API access by means of scraping.
 6 | 
 7 | ## Disclaimer
 8 | 
 9 | Instagram has gone to great lengths to prevent scraping and other unauthorized access to their public content. This module is dependent on the markup of the public-facing instagram.com site. Should that change, this module might also stop working as intended. It also only loads the 12 posts that are displayed on first load, without following pagination to load more images. You should take this into consideration when deciding whether this module will work for you.
10 | 
11 | ## Cloud Proxy support
12 | To avoid many scraping errors (`302 redirect to login` and `429 Rate Limited`), this library has cloud proxy support (free and paid plans are available, depending on your volume). Subscribe to https://rapidapi.com/neotank/api/instagram130, which is a proxy for Instagram, and set the `RAPIDAPI_KEY` environment variable when launching your scraping script: `RAPIDAPI_KEY=your-key node index.js`. This routes all requests to Instagram through proxies with retries and response quality checks under the hood.
13 | 
14 | ## Installation
15 | 
16 | `npm install instagram-scraping`
17 | 
18 | ## Usage
19 | 
20 | There are some limitations when loading Instagram data, but enjoy it. I hope it will help you.
21 | 
22 | ### Tag Scraping Media
23 | 
24 | ```javascript
25 | var ig = require('instagram-scraping');
26 | 
27 | ig.scrapeTag('veranda').then((result) => {
28 |   console.dir(result);
29 | });
30 | ```
31 | 
32 | Example response:
33 | 
34 | ```json
35 | {
36 |   "total": 54,
37 |   "medias": [
38 |     {
39 |       "media_id": "1684684359967334824",
40 |       "shortcode": "CPHnIGbBh1k",
41 |       "text": "Selamat siang komuni!🙋 Sportakular hadir lagi untuk mengawali 2018 kita ini dengan penuh semangat dan kebersamaan, berikut jadwal-jadwalnya : sportakular Voly Kamis,4 Januari 2018 18.00 sd selesai Lap.telkom pinggir monumen Sportakular Futsal Jumat , 5 Januari 2018 17.30-20.00 Lap. Meteor Sportakular Badminton Sabtu,6 Januari2018 19.00-21.00 Lap.Pdam (pinggir ITB) Dicatet ya setiap jadwal kegiatannya, biar tidak terlewatkan karena sayang banget untuk dilewatkan. 😉 dan untuk cabang olahraga lain bakalan mimin share lagi so stay tuned dan selalu ingat: 'Berpartisipasi = Auto Kece😎😎' salam olahraga! #himaik #Ikberaniberkarya #salamsatuik #menujuIKsehat #unikom #sportakular",
42 |       "comment_count": {
43 |         "count": 0
44 |       },
45 |       "like_count": {
46 |         "count": 10
47 |       },
48 |       "display_url": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/e35/25024357_207155156521690_1744670180115480576_n.jpg?se=7",
49 |       "owner_id": "1648294943",
50 |       "date": 1515050047,
51 |       "thumbnail": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s640x640/sh0.08/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg",
52 |       "thumbnail_resource": [
53 |         {
54 |           "src": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s150x150/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg",
55 |           "config_width": 150,
56 |           "config_height": 150
57 |         },
58 |         {
59 |           "src": "https://instagram.fpku1-1.fna.fbcdn.net/t51.2885-15/s240x240/e35/c0.134.1076.1076/25024357_207155156521690_1744670180115480576_n.jpg",
60 |           "config_width": 240,
61 |           "config_height": 240
62 |         }
63 |       ]
64 |     }
65 |   ]
66 | }
67 | ```
68 | ### Scrape Comment
69 | 
70 | ```javascript
71 | // using shortcode for scraping comment 
72 | ig.scrapeComment('CPHnIGbBh1k').then((result) => {
73 |   console.dir(result);
74 | });
75 | ```
76 | ### Deep Tag Scraping
77 | 
78 | ```javascript
79 | ig.deepScrapeTagPage('veranda').then((result) => {
80 |   console.dir(result);
81 | });
82 | ```
83 | 
84 | ### Scrape User Page
85 | 
86 | ```javascript
87 | // using username for scraping
88 | ig.scrapeUserPage('jscmila').then((result) => {
89 |   console.dir(result);
90 | });
91 | ```
92 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | var axios = require('axios'),
  2 |     Promise = require('bluebird'),
  3 |     async = require('async');
  4 | 
  5 | var rapidApiMode = !!process.env.RAPIDAPI_KEY, 
  6 |     rapidApiURL = 'https://instagram130.p.rapidapi.com/proxy';
  7 | 
  8 | var userURL = 'https://www.instagram.com/',
  9 |     listURL = 'https://www.instagram.com/explore/tags/',
 10 |     postURL = 'https://www.instagram.com/p/',
 11 |     locURL = 'https://www.instagram.com/explore/locations/',
 12 |     dataExp = /window\._sharedData\s?=\s?({.+);<\/script>/;
 13 | 
 14 | var headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'}
 15 | 
 16 | if (rapidApiMode) {
 17 |     headers['x-rapidapi-key'] = process.env.RAPIDAPI_KEY
 18 | }
 19 | 
 20 | var proxifyURL = function (url) {
 21 |     if (!rapidApiMode) return url;
 22 | 
 23 |     return rapidApiURL + '?url=' + encodeURIComponent(url);
 24 | } 
 25 | 
 26 | exports.scrapeUserPage = function (username) {
 27 |     return new Promise(function (resolve, reject) {
 28 |         if (!username) return reject(new Error('Argument "username" must be specified'));
 29 | 
 30 |         axios.get(proxifyURL(userURL + username), { headers }).then((result) => {
 31 |             var data = scrape(result.data);
 32 |             if (data && data.entry_data &&
 33 |                 data.entry_data.ProfilePage &&
 34 |                 data.entry_data.ProfilePage[0] &&
 35 |                 data.entry_data.ProfilePage[0].graphql &&
 36 |                 data.entry_data.ProfilePage[0].graphql.user &&
 37 |                 data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media &&
 38 |                 data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.count > 0 && data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges) {
 39 |                 var edges = data.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges;
 40 |                 async.waterfall([
 41 |                     (callback) => {
 42 |                         var medias = [];
 43 |                         edges.forEach((post) => {
 44 |                             if (post.node.__typename === 'GraphImage'||post.node.__typename === 'GraphSidecar' || post.node.__typename === 'GraphVideo') {
 45 |                                 medias.push(exports.scrapePostData(post))
 46 |                             }
 47 |                         });
 48 |                         callback(null, medias);
 49 |                     }
 50 |                 ], (err, results) => {
 51 |                     resolve({
 52 |                         total: results.length,
 53 |                         medias: results,
 54 |                         user: data.entry_data.ProfilePage[0].graphql.user
 55 |                     })
 56 |                 })
 57 |             }
 58 |             else {
 59 |                 reject(new Error('Error scraping user page "' + username + '"'));
 60 |             }
 61 |         }).catch((err) => {
 62 |             reject(new Error('Error scraping user page "' + username + '"'));
 63 |         });
 64 |     });
 65 | };
 66 | 
 67 | exports.deepScrapeTagPage = function (tag) {
 68 |     return new Promise(function (resolve, reject) {
 69 |         exports.scrapeTag(tag).then(function (tagPage) {
 70 |             return Promise.map(tagPage.medias, function (media, i, len) {
 71 |                 return exports.scrapePostCode(media.node.shortcode).then(function (postPage) {
 72 |                     tagPage.medias[i] = postPage;
 73 |                     if (postPage.location && postPage.location.has_public_page) {
 74 |                         return exports.scrapeLocation(postPage.location.id).then(function (locationPage) {
 75 |                             tagPage.media[i].location = locationPage;
 76 |                         })
 77 |                             .catch(function (err) {
 78 |                                 console.log("An error occurred calling scrapeLocation inside deepScrapeTagPage" + ":" + err);
 79 |                             });
 80 |                     }
 81 |                 })
 82 |                     .catch(function (err) {
 83 |                         console.log("An error occurred calling scrapePostPage inside deepScrapeTagPage" + ":" + err);
 84 |                     });
 85 |             })
 86 |                 .then(function () { resolve(tagPage); })
 87 |                 .catch(function (err) {
 88 |                     console.log("An error occurred resolving tagPage inside deepScrapeTagPage" + ":" + err);
 89 |                 });
 90 |         })
 91 |             .catch(function (err) {
 92 |                 console.log("An error occurred calling scrapeTagPage inside deepScrapeTagPage" + ":" + err);
 93 |             });
 94 |     });
 95 | };
 96 | 
 97 | exports.scrapeTag = function (tag) {
 98 |     return new Promise(function (resolve, reject) {
 99 |         if (!tag) return reject(new Error('Argument "tag" must be specified'));
100 | 
101 |         axios.get(proxifyURL(listURL + tag), { headers }).then((result) => {
102 |             var data = scrape(result.data);
103 |             var media = data.entry_data && data.entry_data.TagPage && data.entry_data.TagPage[0].graphql.hashtag.edge_hashtag_to_media;
104 | 
105 |             if (data && media) {
106 |                 var edges = media.edges;
107 | 
108 |                 async.waterfall([
109 |                     (callback) => {
110 |                         var medias = [];
111 |                         edges.forEach((post) => {
112 |                             medias.push(exports.scrapePostData(post))
113 |                         });
114 |                         callback(null, medias);
115 |                     }
116 |                 ], (err, results) => {
117 |                     resolve({
118 |                         total: results.length,
119 |                         medias: results
120 |                     })
121 |                 })
122 | 
123 |             }
124 |             else {
125 |                 reject(new Error('Error scraping tag page "' + tag + '"'));
126 |             }
127 |         }).catch((err) => {
128 |             reject(new Error('Error scraping tag page "' + tag + '": ' + err.message));
129 |         });
130 |     });
131 | };
132 | exports.scrapeComment = function (shortcode) {
133 |     return new Promise(function (resolve, reject) {
134 |         if (!shortcode) return reject(new Error('Argument "shortcode" must be specified'));
135 | 
136 |         axios.get(proxifyURL(postURL+shortcode), { headers }).then((result) => {
137 |             var data = scrape(result.data);
138 |             var comments = data.entry_data.PostPage[0].graphql.shortcode_media.edge_media_to_parent_comment;
139 |             if(comments != undefined){
140 |                 let commentList = comments.edges
141 |                 async.waterfall([
142 |                     (callback) => {
143 |                         var medias = [];
144 |                         commentList.forEach((post) => {
145 |                             // console.log(post)
146 |                             medias.push(post)
147 |                         });
148 |                         callback(null, medias);
149 |                     }
150 |                 ], (err, results) => {
151 |                     if(err){
152 |                         reject(new Error('comment not found for "' + page + '"'));
153 |                     }
154 |                     resolve({
155 |                         total: comments.count,
156 |                         medias: results
157 |                     })
158 |                 })
159 |             }else {
160 |                 reject(new Error('comment not found for "' + page + '"'));
161 |             }
162 | 
163 |         }).catch((err) => {
164 |             reject(new Error('Error scraping page "' + page + '"'));
165 |         });
166 |     });
167 | };
168 | 
169 | exports.scrapePostData = function (post) {
170 |     var scrapedData = {
171 |         media_id: post.node.id,
172 |         shortcode: post.node.shortcode,
173 |         text: post.node.edge_media_to_caption.edges[0] && post.node.edge_media_to_caption.edges[0].node.text,
174 |         comment_count: post.node.edge_media_to_comment.count,
175 |         like_count: post.node.edge_liked_by.count,
176 |         display_url: post.node.display_url,
177 |         owner_id: post.node.owner.id,
178 |         date: post.node.taken_at_timestamp,
179 |         thumbnail: post.node.thumbnail_src,
180 |         thumbnail_resource: post.node.thumbnail_resources,
181 |         is_video: post.node.is_video
182 |     }
183 | 
184 |     if (post.node.is_video) {
185 |         scrapedData.video_view_count = post.node.video_view_count;
186 |     }
187 | 
188 |     return post;
189 | }
190 | 
191 | exports.scrapePostCode = function (code) {
192 |     return new Promise(function (resolve, reject) {
193 |         if (!code) return reject(new Error('Argument "code" must be specified'));
194 | 
195 |         axios.get(proxifyURL(postURL + code), { headers }).then((result) => {
196 |             //if (err) return reject(err);
197 | 
198 |             var data = scrape(result.data);
199 |             if (data && data.entry_data &&
200 |                 data.entry_data.PostPage[0] &&
201 |                 data.entry_data.PostPage[0].graphql &&
202 |                 data.entry_data.PostPage[0].graphql.shortcode_media) {
203 |                 resolve(data.entry_data.PostPage[0].graphql.shortcode_media);
204 |             }
205 |             else {
206 |                 reject(new Error('Error scraping post page "' + code + '"'));
207 |             }
208 |         }).catch((err) => {
209 |             reject(new Error('Error scraping post page "' + code + '":' + err));
210 |         });
211 |     });
212 | }
213 | 
214 | exports.scrapeLocation = function (id) {
215 |     return new Promise(function (resolve, reject) {
216 |         if (!id) return reject(new Error('Argument "id" must be specified'));
217 | 
218 |         axios.get(proxifyURL(locURL + id), { headers }).then((result) => {
219 |             var data = scrape(result.data);
220 | 
221 |             if (data && data.entry_data && (typeof data.entry_data.LocationsPage !== "undefined")) {
222 |                 resolve(data.entry_data.LocationsPage[0].location);
223 |             }
224 |             else {
225 |                 reject(new Error('Error scraping location page "' + id + '"'));
226 |             }
227 |         }).catch((err) => {
228 |             reject(new Error('Error scraping user page "' + id + '"'));
229 |         });;
230 |     });
231 | }
/**
 * Extracts the `window._sharedData` JSON embedded in an Instagram HTML page.
 *
 * @param {string} html - Page markup to search with `dataExp`.
 * @returns {Object|null} The parsed data, or null when the markup has no
 *     match, the captured text is not valid JSON, or `html` cannot be matched
 *     at all. Outside production (NODE_ENV) a failure is also reported on
 *     stderr.
 */
var scrape = function (html) {
    try {
        // A failed match yields null, so indexing it throws into the catch below.
        const found = html.match(dataExp);
        return JSON.parse(found[1]);
    }
    catch (parseError) {
        const inProduction = process.env.NODE_ENV === 'production';
        if (!inProduction) {
            console.error('The HTML returned from instagram was not suitable for scraping');
        }
        return null;
    }
};
246 | 


--------------------------------------------------------------------------------